OLD | NEW |
(Empty) | |
| 1 // |
| 2 // file: regexcmp.cpp |
| 3 // |
| 4 // Copyright (C) 2002-2010 International Business Machines Corporation and othe
rs. |
| 5 // All Rights Reserved. |
| 6 // |
| 7 // This file contains the ICU regular expression compiler, which is responsible |
| 8 // for processing a regular expression pattern into the compiled form that |
| 9 // is used by the match finding engine. |
| 10 // |
| 11 |
| 12 #include "unicode/utypes.h" |
| 13 |
| 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 15 |
| 16 #include "unicode/ustring.h" |
| 17 #include "unicode/unistr.h" |
| 18 #include "unicode/uniset.h" |
| 19 #include "unicode/uchar.h" |
| 20 #include "unicode/uchriter.h" |
| 21 #include "unicode/parsepos.h" |
| 22 #include "unicode/parseerr.h" |
| 23 #include "unicode/regex.h" |
| 24 #include "util.h" |
| 25 #include "putilimp.h" |
| 26 #include "cmemory.h" |
| 27 #include "cstring.h" |
| 28 #include "uvectr32.h" |
| 29 #include "uvectr64.h" |
| 30 #include "uassert.h" |
| 31 #include "ucln_in.h" |
| 32 #include "uinvchar.h" |
| 33 |
| 34 #include "regeximp.h" |
| 35 #include "regexcst.h" // Contains state table for the regex pattern parser. |
| 36 // generated by a Perl script. |
| 37 #include "regexcmp.h" |
| 38 #include "regexst.h" |
| 39 #include "regextxt.h" |
| 40 |
| 41 |
| 42 |
| 43 U_NAMESPACE_BEGIN |
| 44 |
| 45 |
| 46 //------------------------------------------------------------------------------ |
| 47 // |
| 48 // Constructor. |
| 49 // |
| 50 //------------------------------------------------------------------------------ |
| 51 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : |
| 52 fParenStack(status), fSetStack(status), fSetOpStack(status) |
| 53 { |
| 54 // Lazy init of all shared global sets (needed for init()'s empty text) |
| 55 RegexStaticSets::initGlobals(&status); |
| 56 |
| 57 fStatus = &status; |
| 58 |
| 59 fRXPat = rxp; |
| 60 fScanIndex = 0; |
| 61 fLastChar = -1; |
| 62 fPeekChar = -1; |
| 63 fLineNum = 1; |
| 64 fCharNum = 0; |
| 65 fQuoteMode = FALSE; |
| 66 fInBackslashQuote = FALSE; |
| 67 fModeFlags = fRXPat->fFlags | 0x80000000; |
| 68 fEOLComments = TRUE; |
| 69 |
| 70 fMatchOpenParen = -1; |
| 71 fMatchCloseParen = -1; |
| 72 fStringOpStart = -1; |
| 73 |
| 74 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { |
| 75 status = rxp->fDeferredStatus; |
| 76 } |
| 77 } |
| 78 |
| 79 static const UChar chAmp = 0x26; // '&' |
| 80 static const UChar chDash = 0x2d; // '-' |
| 81 |
| 82 |
| 83 //------------------------------------------------------------------------------ |
| 84 // |
| 85 // Destructor |
| 86 // |
| 87 //------------------------------------------------------------------------------ |
| 88 RegexCompile::~RegexCompile() { |
| 89 } |
| 90 |
| 91 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { |
| 92 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK,
value, ec)); |
| 93 } |
| 94 |
| 95 //------------------------------------------------------------------------------ |
| 96 // |
| 97 // Compile regex pattern. The state machine for rexexp pattern parsing is her
e. |
| 98 // The state tables are hand-written in the file regex
cst.txt, |
| 99 // and converted to the form used here by a perl |
| 100 // script regexcst.pl |
| 101 // |
| 102 //------------------------------------------------------------------------------ |
| 103 void RegexCompile::compile( |
| 104 const UnicodeString &pat, // Source pat to be compile
d. |
| 105 UParseError &pp, // Error position info |
| 106 UErrorCode &e) // Error Code |
| 107 { |
| 108 fRXPat->fPatternString = new UnicodeString(pat); |
| 109 UText patternText = UTEXT_INITIALIZER; |
| 110 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); |
| 111 |
| 112 if (U_SUCCESS(e)) { |
| 113 compile(&patternText, pp, e); |
| 114 utext_close(&patternText); |
| 115 } |
| 116 } |
| 117 |
| 118 // |
| 119 // compile, UText mode |
| 120 // All the work is actually done here. |
| 121 // |
| 122 void RegexCompile::compile( |
| 123 UText *pat, // Source pat to be compile
d. |
| 124 UParseError &pp, // Error position info |
| 125 UErrorCode &e) // Error Code |
| 126 { |
| 127 fStatus = &e; |
| 128 fParseErr = &pp; |
| 129 fStackPtr = 0; |
| 130 fStack[fStackPtr] = 0; |
| 131 |
| 132 if (U_FAILURE(*fStatus)) { |
| 133 return; |
| 134 } |
| 135 |
| 136 // There should be no pattern stuff in the RegexPattern object. They can no
t be reused. |
| 137 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) ==
0); |
| 138 |
| 139 // Prepare the RegexPattern object to receive the compiled pattern. |
| 140 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fS
tatus); |
| 141 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; |
| 142 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; |
| 143 |
| 144 |
| 145 // Initialize the pattern scanning state machine |
| 146 fPatternLength = utext_nativeLength(pat); |
| 147 uint16_t state = 1; |
| 148 const RegexTableEl *tableEl; |
| 149 nextChar(fC); // Fetch the first char from the patter
n string. |
| 150 |
| 151 // |
| 152 // Main loop for the regex pattern parsing state machine. |
| 153 // Runs once per state transition. |
| 154 // Each time through optionally performs, depending on the state table, |
| 155 // - an advance to the the next pattern char |
| 156 // - an action to be performed. |
| 157 // - pushing or popping a state to/from the local state return stack. |
| 158 // file regexcst.txt is the source for the state table. The logic behind |
| 159 // recongizing the pattern syntax is there, not here. |
| 160 // |
| 161 for (;;) { |
| 162 // Bail out if anything has gone wrong. |
| 163 // Regex pattern parsing stops on the first error encountered. |
| 164 if (U_FAILURE(*fStatus)) { |
| 165 break; |
| 166 } |
| 167 |
| 168 U_ASSERT(state != 0); |
| 169 |
| 170 // Find the state table element that matches the input char from the pat
tern, or the |
| 171 // class of the input character. Start with the first table row for
this |
| 172 // state, then linearly scan forward until we find a row that matches
the |
| 173 // character. The last row for each state always matches all charact
ers, so |
| 174 // the search will stop there, if not before. |
| 175 // |
| 176 tableEl = &gRuleParseStateTable[state]; |
| 177 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s
", |
| 178 fC.fChar, fLineNum, fCharNum, RegexStateNames[state])); |
| 179 |
| 180 for (;;) { // loop through table rows belonging to this state, lookin
g for one |
| 181 // that matches the current input char. |
| 182 REGEX_SCAN_DEBUG_PRINTF((".")); |
| 183 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->f
CharClass == fC.fChar) { |
| 184 // Table row specified an individual character, not a set, and |
| 185 // the input character is not quoted, and |
| 186 // the input character matched it. |
| 187 break; |
| 188 } |
| 189 if (tableEl->fCharClass == 255) { |
| 190 // Table row specified default, match anything character class. |
| 191 break; |
| 192 } |
| 193 if (tableEl->fCharClass == 254 && fC.fQuoted) { |
| 194 // Table row specified "quoted" and the char was quoted. |
| 195 break; |
| 196 } |
| 197 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) { |
| 198 // Table row specified eof and we hit eof on the input. |
| 199 break; |
| 200 } |
| 201 |
| 202 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && //
Table specs a char class && |
| 203 fC.fQuoted == FALSE && //
char is not escaped && |
| 204 fC.fChar != (UChar32)-1) { //
char is not EOF |
| 205 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-
128].contains(fC.fChar)) { |
| 206 // Table row specified a character class, or set of characte
rs, |
| 207 // and the current char matches it. |
| 208 break; |
| 209 } |
| 210 } |
| 211 |
| 212 // No match on this row, advance to the next row for this state, |
| 213 tableEl++; |
| 214 } |
| 215 REGEX_SCAN_DEBUG_PRINTF(("\n")); |
| 216 |
| 217 // |
| 218 // We've found the row of the state table that matches the current input |
| 219 // character from the rules string. |
| 220 // Perform any action specified by this row in the state table. |
| 221 if (doParseActions(tableEl->fAction) == FALSE) { |
| 222 // Break out of the state machine loop if the |
| 223 // the action signalled some kind of error, or |
| 224 // the action was to exit, occurs on normal end-of-rules-input. |
| 225 break; |
| 226 } |
| 227 |
| 228 if (tableEl->fPushState != 0) { |
| 229 fStackPtr++; |
| 230 if (fStackPtr >= kStackSize) { |
| 231 error(U_REGEX_INTERNAL_ERROR); |
| 232 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack ov
erflow.\n")); |
| 233 fStackPtr--; |
| 234 } |
| 235 fStack[fStackPtr] = tableEl->fPushState; |
| 236 } |
| 237 |
| 238 // |
| 239 // NextChar. This is where characters are actually fetched from the pa
ttern. |
| 240 // Happens under control of the 'n' tag in the state table. |
| 241 // |
| 242 if (tableEl->fNextChar) { |
| 243 nextChar(fC); |
| 244 } |
| 245 |
| 246 // Get the next state from the table entry, or from the |
| 247 // state stack if the next state was specified as "pop". |
| 248 if (tableEl->fNextState != 255) { |
| 249 state = tableEl->fNextState; |
| 250 } else { |
| 251 state = fStack[fStackPtr]; |
| 252 fStackPtr--; |
| 253 if (fStackPtr < 0) { |
| 254 // state stack underflow |
| 255 // This will occur if the user pattern has mis-matched parenthes
es, |
| 256 // with extra close parens. |
| 257 // |
| 258 fStackPtr++; |
| 259 error(U_REGEX_MISMATCHED_PAREN); |
| 260 } |
| 261 } |
| 262 |
| 263 } |
| 264 |
| 265 if (U_FAILURE(*fStatus)) { |
| 266 // Bail out if the pattern had errors. |
| 267 // Set stack cleanup: a successful compile would have left it empty, |
| 268 // but errors can leave temporary sets hanging around. |
| 269 while (!fSetStack.empty()) { |
| 270 delete (UnicodeSet *)fSetStack.pop(); |
| 271 } |
| 272 return; |
| 273 } |
| 274 |
| 275 // |
| 276 // The pattern has now been read and processed, and the compiled code genera
ted. |
| 277 // |
| 278 |
| 279 // |
| 280 // Compute the number of digits requried for the largest capture group numbe
r. |
| 281 // |
| 282 fRXPat->fMaxCaptureDigits = 1; |
| 283 int32_t n = 10; |
| 284 int32_t groupCount = fRXPat->fGroupMap->size(); |
| 285 while (n <= groupCount) { |
| 286 fRXPat->fMaxCaptureDigits++; |
| 287 n *= 10; |
| 288 } |
| 289 |
| 290 // |
| 291 // The pattern's fFrameSize so far has accumulated the requirements for |
| 292 // storage for capture parentheses, counters, etc. that are encountered |
| 293 // in the pattern. Add space for the two variables that are always |
| 294 // present in the saved state: the input string position (int64_t) and |
| 295 // the position in the compiled pattern. |
| 296 // |
| 297 fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; |
| 298 |
| 299 // |
| 300 // Optimization pass 1: NOPs, back-references, and case-folding |
| 301 // |
| 302 stripNOPs(); |
| 303 |
| 304 // |
| 305 // Get bounds for the minimum and maximum length of a string that this |
| 306 // pattern can match. Used to avoid looking for matches in strings that |
| 307 // are too short. |
| 308 // |
| 309 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1); |
| 310 |
| 311 // |
| 312 // Optimization pass 2: match start type |
| 313 // |
| 314 matchStartType(); |
| 315 |
| 316 // |
| 317 // Set up fast latin-1 range sets |
| 318 // |
| 319 int32_t numSets = fRXPat->fSets->size(); |
| 320 fRXPat->fSets8 = new Regex8BitSet[numSets]; |
| 321 // Null pointer check. |
| 322 if (fRXPat->fSets8 == NULL) { |
| 323 e = *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| 324 return; |
| 325 } |
| 326 int32_t i; |
| 327 for (i=0; i<numSets; i++) { |
| 328 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i); |
| 329 fRXPat->fSets8[i].init(s); |
| 330 } |
| 331 |
| 332 } |
| 333 |
| 334 |
| 335 |
| 336 |
| 337 |
| 338 //------------------------------------------------------------------------------ |
| 339 // |
| 340 // doParseAction Do some action during regex pattern parsing. |
| 341 // Called by the parse state machine. |
| 342 // |
| 343 // Generation of the match engine PCode happens here, or |
| 344 // in functions called from the parse actions defined here
. |
| 345 // |
| 346 // |
| 347 //------------------------------------------------------------------------------ |
| 348 UBool RegexCompile::doParseActions(int32_t action) |
| 349 { |
| 350 UBool returnVal = TRUE; |
| 351 |
| 352 switch ((Regex_PatternParseAction)action) { |
| 353 |
| 354 case doPatStart: |
| 355 // Start of pattern compiles to: |
| 356 //0 SAVE 2 Fall back to position of FAIL |
| 357 //1 jmp 3 |
| 358 //2 FAIL Stop if we ever reach here. |
| 359 //3 NOP Dummy, so start of pattern looks the same as |
| 360 // the start of an ( grouping. |
| 361 //4 NOP Resreved, will be replaced by a save if there are |
| 362 // OR | operators at the top level |
| 363 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus)
; |
| 364 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); |
| 365 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); |
| 366 |
| 367 // Standard open nonCapture paren action emits the two NOPs and |
| 368 // sets up the paren stack frame. |
| 369 doParseActions(doOpenNonCaptureParen); |
| 370 break; |
| 371 |
| 372 case doPatFinish: |
| 373 // We've scanned to the end of the pattern |
| 374 // The end of pattern compiles to: |
| 375 // URX_END |
| 376 // which will stop the runtime match engine. |
| 377 // Encountering end of pattern also behaves like a close paren, |
| 378 // and forces fixups of the State Save at the beginning of the compile
d pattern |
| 379 // and of any OR operations at the top level. |
| 380 // |
| 381 handleCloseParen(); |
| 382 if (fParenStack.size() > 0) { |
| 383 // Missing close paren in pattern. |
| 384 error(U_REGEX_MISMATCHED_PAREN); |
| 385 } |
| 386 |
| 387 // add the END operation to the compiled pattern. |
| 388 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); |
| 389 |
| 390 // Terminate the pattern compilation state machine. |
| 391 returnVal = FALSE; |
| 392 break; |
| 393 |
| 394 |
| 395 |
| 396 case doOrOperator: |
| 397 // Scanning a '|', as in (A|B) |
| 398 { |
| 399 // Insert a SAVE operation at the start of the pattern section prece
ding |
| 400 // this OR at this level. This SAVE will branch the match forward |
| 401 // to the right hand side of the OR in the event that the left han
d |
| 402 // side fails to match and backtracks. Locate the position for th
e |
| 403 // save from the location on the top of the parentheses stack. |
| 404 int32_t savePosition = fParenStack.popi(); |
| 405 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition)
; |
| 406 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved
location |
| 407 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); |
| 408 fRXPat->fCompiledPat->setElementAt(op, savePosition); |
| 409 |
| 410 // Append an JMP operation into the compiled pattern. The operand f
or |
| 411 // the JMP will eventually be the location following the ')' for th
e |
| 412 // group. This will be patched in later, when the ')' is encounter
ed. |
| 413 op = URX_BUILD(URX_JMP, 0); |
| 414 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 415 |
| 416 // Push the position of the newly added JMP op onto the parentheses
stack. |
| 417 // This registers if for fixup when this block's close paren is enco
untered. |
| 418 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
| 419 |
| 420 // Append a NOP to the compiled pattern. This is the slot reserved |
| 421 // for a SAVE in the event that there is yet another '|' following |
| 422 // this one. |
| 423 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 424 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
| 425 } |
| 426 break; |
| 427 |
| 428 |
| 429 case doOpenCaptureParen: |
| 430 // Open Paren. |
| 431 // Compile to a |
| 432 // - NOP, which later may be replaced by a save-state if the |
| 433 // parenthesized group gets a * quantifier, followed by |
| 434 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. |
| 435 // - NOP, which may later be replaced by a save-state if there |
| 436 // is an '|' alternation within the parens. |
| 437 // |
| 438 // Each capture group gets three slots in the save stack frame: |
| 439 // 0: Capture Group start position (in input string being matche
d.) |
| 440 // 1: Capture Group end position. |
| 441 // 2: Start of Match-in-progress. |
| 442 // The first two locations are for a completed capture group, and are |
| 443 // referred to by back references and the like. |
| 444 // The third location stores the capture start position when an START
_CAPTURE is |
| 445 // encountered. This will be promoted to a completed capture when
(and if) the corresponding |
| 446 // END_CAPTURE is encountered. |
| 447 { |
| 448 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 449 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots
in match stack frame. |
| 450 fRXPat->fFrameSize += 3; |
| 451 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); |
| 452 fRXPat->fCompiledPat->addElement(cop, *fStatus); |
| 453 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 454 |
| 455 // On the Parentheses stack, start a new frame and add the postions |
| 456 // of the two NOPs. Depending on what follows in the pattern, the |
| 457 // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
| 458 // address of the end of the parenthesized group. |
| 459 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 460 fParenStack.push(capturing, *fStatus); // Fra
me type. |
| 461 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location |
| 462 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc |
| 463 |
| 464 // Save the mapping from group number to stack frame variable positi
on. |
| 465 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); |
| 466 } |
| 467 break; |
| 468 |
| 469 case doOpenNonCaptureParen: |
| 470 // Open non-caputuring (grouping only) Paren. |
| 471 // Compile to a |
| 472 // - NOP, which later may be replaced by a save-state if the |
| 473 // parenthesized group gets a * quantifier, followed by |
| 474 // - NOP, which may later be replaced by a save-state if there |
| 475 // is an '|' alternation within the parens. |
| 476 { |
| 477 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 478 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 479 |
| 480 // On the Parentheses stack, start a new frame and add the postions |
| 481 // of the two NOPs. |
| 482 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 483 fParenStack.push(plain, *fStatus); // Beg
in a new frame. |
| 484 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
| 485 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc |
| 486 } |
| 487 break; |
| 488 |
| 489 |
| 490 case doOpenAtomicParen: |
| 491 // Open Atomic Paren. (?> |
| 492 // Compile to a |
| 493 // - NOP, which later may be replaced if the parenthesized group |
| 494 // has a quantifier, followed by |
| 495 // - STO_SP save state stack position, so it can be restored at th
e ")" |
| 496 // - NOP, which may later be replaced by a save-state if there |
| 497 // is an '|' alternation within the parens. |
| 498 { |
| 499 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 500 int32_t varLoc = fRXPat->fDataSize; // Reserve a data locatio
n for saving the |
| 501 fRXPat->fDataSize += 1; // state stack ptr. |
| 502 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); |
| 503 fRXPat->fCompiledPat->addElement(stoOp, *fStatus); |
| 504 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 505 |
| 506 // On the Parentheses stack, start a new frame and add the postions |
| 507 // of the two NOPs. Depending on what follows in the pattern, the |
| 508 // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
| 509 // address of the end of the parenthesized group. |
| 510 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 511 fParenStack.push(atomic, *fStatus); // Fra
me type. |
| 512 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP |
| 513 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP |
| 514 } |
| 515 break; |
| 516 |
| 517 |
| 518 case doOpenLookAhead: |
| 519 // Positive Look-ahead (?= stuff ) |
| 520 // |
| 521 // Note: Addition of transparent input regions, with the need to |
| 522 // restore the original regions when failing out of a lookahea
d |
| 523 // block, complicated this sequence. Some conbined opcodes |
| 524 // might make sense - or might not, lookahead aren't that comm
on. |
| 525 // |
| 526 // Caution: min match length optimization knows about this |
| 527 // sequence; don't change without making updates there too
. |
| 528 // |
| 529 // Compiles to |
| 530 // 1 START_LA dataLoc Saves SP, Input Pos |
| 531 // 2. STATE_SAVE 4 on failure of lookahead, goto 4 |
| 532 // 3 JMP 6 continue ... |
| 533 // |
| 534 // 4. LA_END Look Ahead failed. Restore regions. |
| 535 // 5. BACKTRACK and back track again. |
| 536 // |
| 537 // 6. NOP reserved for use by quantifiers on the block
. |
| 538 // Look-ahead can't have quantifiers, but paren
stack |
| 539 // compile time conventions require the slot
anyhow. |
| 540 // 7. NOP may be replaced if there is are '|' ops in t
he block. |
| 541 // 8. code for parenthesized stuff. |
| 542 // 9. LA_END |
| 543 // |
| 544 // Two data slots are reserved, for saving the stack ptr and the input
position. |
| 545 { |
| 546 int32_t dataLoc = fRXPat->fDataSize; |
| 547 fRXPat->fDataSize += 2; |
| 548 int32_t op = URX_BUILD(URX_LA_START, dataLoc); |
| 549 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 550 |
| 551 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); |
| 552 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 553 |
| 554 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); |
| 555 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 556 |
| 557 op = URX_BUILD(URX_LA_END, dataLoc); |
| 558 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 559 |
| 560 op = URX_BUILD(URX_BACKTRACK, 0); |
| 561 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 562 |
| 563 op = URX_BUILD(URX_NOP, 0); |
| 564 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 565 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 566 |
| 567 // On the Parentheses stack, start a new frame and add the postions |
| 568 // of the NOPs. |
| 569 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 570 fParenStack.push(lookAhead, *fStatus); // Fra
me type. |
| 571 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
| 572 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location |
| 573 } |
| 574 break; |
| 575 |
| 576 case doOpenLookAheadNeg: |
| 577 // Negated Lookahead. (?! stuff ) |
| 578 // Compiles to |
| 579 // 1. START_LA dataloc |
| 580 // 2. SAVE_STATE 7 // Fail within look-ahead block restor
es to this state, |
| 581 // // which continues with the match. |
| 582 // 3. NOP // Std. Open Paren sequence, for possi
ble '|' |
| 583 // 4. code for parenthesized stuff. |
| 584 // 5. END_LA // Cut back stack, remove saved state
from step 2. |
| 585 // 6. BACKTRACK // code in block succeeded, so neg. lo
okahead fails. |
| 586 // 7. END_LA // Restore match region, in case look-
ahead was using |
| 587 // an alternate (transparent) reg
ion. |
| 588 { |
| 589 int32_t dataLoc = fRXPat->fDataSize; |
| 590 fRXPat->fDataSize += 2; |
| 591 int32_t op = URX_BUILD(URX_LA_START, dataLoc); |
| 592 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 593 |
| 594 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patche
d later. |
| 595 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 596 |
| 597 op = URX_BUILD(URX_NOP, 0); |
| 598 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 599 |
| 600 // On the Parentheses stack, start a new frame and add the postions |
| 601 // of the StateSave and NOP. |
| 602 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 603 fParenStack.push(negLookAhead, *fStatus); // Fram
e type |
| 604 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
STATE_SAVE location |
| 605 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location |
| 606 |
| 607 // Instructions #5 - #7 will be added when the ')' is encountered. |
| 608 } |
| 609 break; |
| 610 |
| 611 case doOpenLookBehind: |
| 612 { |
| 613 // Compile a (?<= look-behind open paren. |
| 614 // |
| 615 // Compiles to |
| 616 // 0 URX_LB_START dataLoc |
| 617 // 1 URX_LB_CONT dataLoc |
| 618 // 2 MinMatchLen |
| 619 // 3 MaxMatchLen |
| 620 // 4 URX_NOP Standard '(' boilerplate. |
| 621 // 5 URX_NOP Reserved slot for use with
'|' ops within (block). |
| 622 // 6 <code for LookBehind expression> |
| 623 // 7 URX_LB_END dataLoc # Check match le
n, restore input len |
| 624 // 8 URX_LA_END dataLoc # Restore stack,
input pos |
| 625 // |
| 626 // Allocate a block of matcher data, to contain (when runni
ng a match) |
| 627 // 0: Stack ptr on entry |
| 628 // 1: Input Index on entry |
| 629 // 2: Start index of match current match attempt. |
| 630 // 3: Original Input String len. |
| 631 |
| 632 // Allocate data space |
| 633 int32_t dataLoc = fRXPat->fDataSize; |
| 634 fRXPat->fDataSize += 4; |
| 635 |
| 636 // Emit URX_LB_START |
| 637 int32_t op = URX_BUILD(URX_LB_START, dataLoc); |
| 638 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 639 |
| 640 // Emit URX_LB_CONT |
| 641 op = URX_BUILD(URX_LB_CONT, dataLoc); |
| 642 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 643 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength
. To be filled later. |
| 644 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength
. To be filled later. |
| 645 |
| 646 // Emit the NOP |
| 647 op = URX_BUILD(URX_NOP, 0); |
| 648 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 649 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 650 |
| 651 // On the Parentheses stack, start a new frame and add the postions |
| 652 // of the URX_LB_CONT and the NOP. |
| 653 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 654 fParenStack.push(lookBehind, *fStatus); // Fra
me type |
| 655 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
| 656 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location |
| 657 |
| 658 // The final two instructions will be added when the ')' is encounte
red. |
| 659 } |
| 660 |
| 661 break; |
| 662 |
| 663 case doOpenLookBehindNeg: |
| 664 { |
| 665 // Compile a (?<! negated look-behind open paren. |
| 666 // |
| 667 // Compiles to |
| 668 // 0 URX_LB_START dataLoc # Save entry sta
ck, input len |
| 669 // 1 URX_LBN_CONT dataLoc # Iterate possib
le match positions |
| 670 // 2 MinMatchLen |
| 671 // 3 MaxMatchLen |
| 672 // 4 continueLoc (9) |
| 673 // 5 URX_NOP Standard '(' boilerplate. |
| 674 // 6 URX_NOP Reserved slot for use with
'|' ops within (block). |
| 675 // 7 <code for LookBehind expression> |
| 676 // 8 URX_LBN_END dataLoc # Check match le
n, cause a FAIL |
| 677 // 9 ... |
| 678 // |
| 679 // Allocate a block of matcher data, to contain (when runni
ng a match) |
| 680 // 0: Stack ptr on entry |
| 681 // 1: Input Index on entry |
| 682 // 2: Start index of match current match attempt. |
| 683 // 3: Original Input String len. |
| 684 |
| 685 // Allocate data space |
| 686 int32_t dataLoc = fRXPat->fDataSize; |
| 687 fRXPat->fDataSize += 4; |
| 688 |
| 689 // Emit URX_LB_START |
| 690 int32_t op = URX_BUILD(URX_LB_START, dataLoc); |
| 691 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 692 |
| 693 // Emit URX_LBN_CONT |
| 694 op = URX_BUILD(URX_LBN_CONT, dataLoc); |
| 695 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 696 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength
. To be filled later. |
| 697 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength
. To be filled later. |
| 698 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc.
To be filled later. |
| 699 |
| 700 // Emit the NOP |
| 701 op = URX_BUILD(URX_NOP, 0); |
| 702 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 703 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 704 |
| 705 // On the Parentheses stack, start a new frame and add the postions |
| 706 // of the URX_LB_CONT and the NOP. |
| 707 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 708 fParenStack.push(lookBehindN, *fStatus); // Fra
me type |
| 709 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
| 710 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location |
| 711 |
| 712 // The final two instructions will be added when the ')' is encounte
red. |
| 713 } |
| 714 break; |
| 715 |
| 716 case doConditionalExpr: |
| 717 // Conditionals such as (?(1)a:b) |
| 718 case doPerlInline: |
| 719 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to
do them. |
| 720 error(U_REGEX_UNIMPLEMENTED); |
| 721 break; |
| 722 |
| 723 |
| 724 case doCloseParen: |
| 725 handleCloseParen(); |
| 726 if (fParenStack.size() <= 0) { |
| 727 // Extra close paren, or missing open paren. |
| 728 error(U_REGEX_MISMATCHED_PAREN); |
| 729 } |
| 730 break; |
| 731 |
| 732 case doNOP: |
| 733 break; |
| 734 |
| 735 |
| 736 case doBadOpenParenType: |
| 737 case doRuleError: |
| 738 error(U_REGEX_RULE_SYNTAX); |
| 739 break; |
| 740 |
| 741 |
| 742 case doMismatchedParenErr: |
| 743 error(U_REGEX_MISMATCHED_PAREN); |
| 744 break; |
| 745 |
| 746 case doPlus: |
| 747 // Normal '+' compiles to |
| 748 // 1. stuff to be repeated (already built) |
| 749 // 2. jmp-sav 1 |
| 750 // 3. ... |
| 751 // |
| 752 // Or, if the item to be repeated can match a zero length string, |
| 753 // 1. STO_INP_LOC data-loc |
| 754 // 2. body of stuff to be repeated |
| 755 // 3. JMP_SAV_X 2 |
| 756 // 4. ... |
| 757 |
| 758 // |
| 759 // Or, if the item to be repeated is simple |
| 760 // 1. Item to be repeated. |
| 761 // 2. LOOP_SR_I set number (assuming repeated item is a set re
f) |
| 762 // 3. LOOP_C stack location |
| 763 { |
| 764 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 |
| 765 int32_t frameLoc; |
| 766 |
| 767 // Check for simple constructs, which may get special optimized code
. |
| 768 if (topLoc == fRXPat->fCompiledPat->size() - 1) { |
| 769 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); |
| 770 |
| 771 if (URX_TYPE(repeatedOp) == URX_SETREF) { |
| 772 // Emit optimized code for [char set]+ |
| 773 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); |
| 774 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); |
| 775 frameLoc = fRXPat->fFrameSize; |
| 776 fRXPat->fFrameSize++; |
| 777 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); |
| 778 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
| 779 break; |
| 780 } |
| 781 |
| 782 if (URX_TYPE(repeatedOp) == URX_DOTANY || |
| 783 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || |
| 784 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { |
| 785 // Emit Optimized code for .+ operations. |
| 786 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); |
| 787 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { |
| 788 // URX_LOOP_DOT_I operand is a flag indicating ". matche
s any" mode. |
| 789 loopOpI |= 1; |
| 790 } |
| 791 if (fModeFlags & UREGEX_UNIX_LINES) { |
| 792 loopOpI |= 2; |
| 793 } |
| 794 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); |
| 795 frameLoc = fRXPat->fFrameSize; |
| 796 fRXPat->fFrameSize++; |
| 797 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); |
| 798 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
| 799 break; |
| 800 } |
| 801 |
| 802 } |
| 803 |
| 804 // General case. |
| 805 |
| 806 // Check for minimum match length of zero, which requires |
| 807 // extra loop-breaking code. |
| 808 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { |
| 809 // Zero length match is possible. |
| 810 // Emit the code sequence that can handle it. |
| 811 insertOp(topLoc); |
| 812 frameLoc = fRXPat->fFrameSize; |
| 813 fRXPat->fFrameSize++; |
| 814 |
| 815 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); |
| 816 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 817 |
| 818 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); |
| 819 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 820 } else { |
| 821 // Simpler code when the repeated body must match something non-
empty |
| 822 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); |
| 823 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); |
| 824 } |
| 825 } |
| 826 break; |
| 827 |
| 828 case doNGPlus: |
| 829 // Non-greedy '+?' compiles to |
| 830 // 1. stuff to be repeated (already built) |
| 831 // 2. state-save 1 |
| 832 // 3. ... |
| 833 { |
| 834 int32_t topLoc = blockTopLoc(FALSE); |
| 835 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); |
| 836 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); |
| 837 } |
| 838 break; |
| 839 |
| 840 |
| 841 case doOpt: |
| 842 // Normal (greedy) ? quantifier. |
| 843 // Compiles to |
| 844 // 1. state save 3 |
| 845 // 2. body of optional block |
| 846 // 3. ... |
| 847 // Insert the state save into the compiled pattern, and we're done. |
| 848 { |
| 849 int32_t saveStateLoc = blockTopLoc(TRUE); |
| 850 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiled
Pat->size()); |
| 851 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); |
| 852 } |
| 853 break; |
| 854 |
| 855 case doNGOpt: |
| 856 // Non-greedy ?? quantifier |
| 857 // compiles to |
| 858 // 1. jmp 4 |
| 859 // 2. body of optional block |
| 860 // 3 jmp 5 |
| 861 // 4. state save 2 |
| 862 // 5 ... |
| 863 // This code is less than ideal, with two jmps instead of one, because
we can only |
| 864 // insert one instruction at the top of the block being iterated. |
| 865 { |
| 866 int32_t jmp1_loc = blockTopLoc(TRUE); |
| 867 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); |
| 868 |
| 869 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); |
| 870 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); |
| 871 |
| 872 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); |
| 873 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); |
| 874 |
| 875 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); |
| 876 fRXPat->fCompiledPat->addElement(save_op, *fStatus); |
| 877 } |
| 878 break; |
| 879 |
| 880 |
| 881 case doStar: |
| 882 // Normal (greedy) * quantifier. |
| 883 // Compiles to |
| 884 // 1. STATE_SAVE 4 |
| 885 // 2. body of stuff being iterated over |
| 886 // 3. JMP_SAV 2 |
| 887 // 4. ... |
| 888 // |
| 889 // Or, if the body is a simple [Set], |
| 890 // 1. LOOP_SR_I set number |
| 891 // 2. LOOP_C stack location |
| 892 // ... |
| 893 // |
| 894 // Or if this is a .* |
| 895 // 1. LOOP_DOT_I (. matches all mode flag) |
| 896 // 2. LOOP_C stack location |
| 897 // |
| 898 // Or, if the body can match a zero-length string, to inhibit infinite l
oops, |
| 899 // 1. STATE_SAVE 5 |
| 900 // 2. STO_INP_LOC data-loc |
| 901 // 3. body of stuff |
| 902 // 4. JMP_SAV_X 2 |
| 903 // 5. ... |
| 904 { |
| 905 // location of item #1, the STATE_SAVE |
| 906 int32_t topLoc = blockTopLoc(FALSE); |
| 907 int32_t dataLoc = -1; |
| 908 |
| 909 // Check for simple *, where the construct being repeated |
| 910 // compiled to single opcode, and might be optimizable. |
| 911 if (topLoc == fRXPat->fCompiledPat->size() - 1) { |
| 912 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); |
| 913 |
| 914 if (URX_TYPE(repeatedOp) == URX_SETREF) { |
| 915 // Emit optimized code for a [char set]* |
| 916 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); |
| 917 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); |
| 918 dataLoc = fRXPat->fFrameSize; |
| 919 fRXPat->fFrameSize++; |
| 920 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); |
| 921 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
| 922 break; |
| 923 } |
| 924 |
| 925 if (URX_TYPE(repeatedOp) == URX_DOTANY || |
| 926 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || |
| 927 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { |
| 928 // Emit Optimized code for .* operations. |
| 929 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); |
| 930 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { |
| 931 // URX_LOOP_DOT_I operand is a flag indicating . matches
any mode. |
| 932 loopOpI |= 1; |
| 933 } |
| 934 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { |
| 935 loopOpI |= 2; |
| 936 } |
| 937 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); |
| 938 dataLoc = fRXPat->fFrameSize; |
| 939 fRXPat->fFrameSize++; |
| 940 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); |
| 941 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
| 942 break; |
| 943 } |
| 944 } |
| 945 |
| 946 // Emit general case code for this * |
| 947 // The optimizations did not apply. |
| 948 |
| 949 int32_t saveStateLoc = blockTopLoc(TRUE); |
| 950 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); |
| 951 |
| 952 // Check for minimum match length of zero, which requires |
| 953 // extra loop-breaking code. |
| 954 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) ==
0) { |
| 955 insertOp(saveStateLoc); |
| 956 dataLoc = fRXPat->fFrameSize; |
| 957 fRXPat->fFrameSize++; |
| 958 |
| 959 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); |
| 960 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); |
| 961 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); |
| 962 } |
| 963 |
| 964 // Locate the position in the compiled pattern where the match will
continue |
| 965 // after completing the *. (4 or 5 in the comment above) |
| 966 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
| 967 |
| 968 // Put together the save state op store it into the compiled code. |
| 969 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); |
| 970 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); |
| 971 |
| 972 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled patt
ern. |
| 973 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); |
| 974 } |
| 975 break; |
| 976 |
| 977 case doNGStar: |
| 978 // Non-greedy *? quantifier |
| 979 // compiles to |
| 980 // 1. JMP 3 |
| 981 // 2. body of stuff being iterated over |
| 982 // 3. STATE_SAVE 2 |
| 983 // 4 ... |
| 984 { |
| 985 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1
. |
| 986 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3
. |
| 987 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); |
| 988 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); |
| 989 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); |
| 990 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); |
| 991 } |
| 992 break; |
| 993 |
| 994 |
| 995 case doIntervalInit: |
| 996 // The '{' opening an interval quantifier was just scanned. |
| 997 // Init the counter varaiables that will accumulate the values as the di
gits |
| 998 // are scanned. |
| 999 fIntervalLow = 0; |
| 1000 fIntervalUpper = -1; |
| 1001 break; |
| 1002 |
| 1003 case doIntevalLowerDigit: |
| 1004 // Scanned a digit from the lower value of an {lower,upper} interval |
| 1005 { |
| 1006 int32_t digitValue = u_charDigitValue(fC.fChar); |
| 1007 U_ASSERT(digitValue >= 0); |
| 1008 fIntervalLow = fIntervalLow*10 + digitValue; |
| 1009 if (fIntervalLow < 0) { |
| 1010 error(U_REGEX_NUMBER_TOO_BIG); |
| 1011 } |
| 1012 } |
| 1013 break; |
| 1014 |
| 1015 case doIntervalUpperDigit: |
| 1016 // Scanned a digit from the upper value of an {lower,upper} interval |
| 1017 { |
| 1018 if (fIntervalUpper < 0) { |
| 1019 fIntervalUpper = 0; |
| 1020 } |
| 1021 int32_t digitValue = u_charDigitValue(fC.fChar); |
| 1022 U_ASSERT(digitValue >= 0); |
| 1023 fIntervalUpper = fIntervalUpper*10 + digitValue; |
| 1024 if (fIntervalUpper < 0) { |
| 1025 error(U_REGEX_NUMBER_TOO_BIG); |
| 1026 } |
| 1027 } |
| 1028 break; |
| 1029 |
| 1030 case doIntervalSame: |
| 1031 // Scanned a single value interval like {27}. Upper = Lower. |
| 1032 fIntervalUpper = fIntervalLow; |
| 1033 break; |
| 1034 |
| 1035 case doInterval: |
| 1036 // Finished scanning a normal {lower,upper} interval. Generate the code
for it. |
| 1037 if (compileInlineInterval() == FALSE) { |
| 1038 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); |
| 1039 } |
| 1040 break; |
| 1041 |
| 1042 case doPossessiveInterval: |
| 1043 // Finished scanning a Possessive {lower,upper}+ interval. Generate the
code for it. |
| 1044 { |
| 1045 // Remember the loc for the top of the block being looped over. |
| 1046 // (Can not reserve a slot in the compiled pattern at this time, b
ecause |
| 1047 // compileInterval needs to reserve also, and blockTopLoc can onl
y reserve |
| 1048 // once per block.) |
| 1049 int32_t topLoc = blockTopLoc(FALSE); |
| 1050 |
| 1051 // Produce normal looping code. |
| 1052 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); |
| 1053 |
| 1054 // Surround the just-emitted normal looping code with a STO_SP ... L
D_SP |
| 1055 // just as if the loop was inclosed in atomic parentheses. |
| 1056 |
| 1057 // First the STO_SP before the start of the loop |
| 1058 insertOp(topLoc); |
| 1059 int32_t varLoc = fRXPat->fDataSize; // Reserve a data locatio
n for saving the |
| 1060 fRXPat->fDataSize += 1; // state stack ptr. |
| 1061 int32_t op = URX_BUILD(URX_STO_SP, varLoc); |
| 1062 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 1063 |
| 1064 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); |
| 1065 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topL
oc); |
| 1066 loopOp++; // point LoopOp after the just-inserted STO_SP |
| 1067 fRXPat->fCompiledPat->push(loopOp, *fStatus); |
| 1068 |
| 1069 // Then the LD_SP after the end of the loop |
| 1070 op = URX_BUILD(URX_LD_SP, varLoc); |
| 1071 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1072 } |
| 1073 |
| 1074 break; |
| 1075 |
| 1076 case doNGInterval: |
| 1077 // Finished scanning a non-greedy {lower,upper}? interval. Generate the
code for it. |
| 1078 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); |
| 1079 break; |
| 1080 |
| 1081 case doIntervalError: |
| 1082 error(U_REGEX_BAD_INTERVAL); |
| 1083 break; |
| 1084 |
| 1085 case doLiteralChar: |
| 1086 // We've just scanned a "normal" character from the pattern, |
| 1087 literalChar(fC.fChar); |
| 1088 break; |
| 1089 |
| 1090 |
| 1091 case doEscapedLiteralChar: |
| 1092 // We've just scanned an backslashed escaped character with no |
| 1093 // special meaning. It represents itself. |
| 1094 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && |
| 1095 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] |
| 1096 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] |
| 1097 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
| 1098 } |
| 1099 literalChar(fC.fChar); |
| 1100 break; |
| 1101 |
| 1102 |
| 1103 case doDotAny: |
| 1104 // scanned a ".", match any single character. |
| 1105 { |
| 1106 int32_t op; |
| 1107 if (fModeFlags & UREGEX_DOTALL) { |
| 1108 op = URX_BUILD(URX_DOTANY_ALL, 0); |
| 1109 } else if (fModeFlags & UREGEX_UNIX_LINES) { |
| 1110 op = URX_BUILD(URX_DOTANY_UNIX, 0); |
| 1111 } else { |
| 1112 op = URX_BUILD(URX_DOTANY, 0); |
| 1113 } |
| 1114 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1115 } |
| 1116 break; |
| 1117 |
| 1118 case doCaret: |
| 1119 { |
| 1120 int32_t op = 0; |
| 1121 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
| 1122 op = URX_CARET; |
| 1123 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
| 1124 op = URX_CARET_M; |
| 1125 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
| 1126 op = URX_CARET; // Only testing true start of input. |
| 1127 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
| 1128 op = URX_CARET_M_UNIX; |
| 1129 } |
| 1130 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); |
| 1131 } |
| 1132 break; |
| 1133 |
| 1134 case doDollar: |
| 1135 { |
| 1136 int32_t op = 0; |
| 1137 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
| 1138 op = URX_DOLLAR; |
| 1139 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
| 1140 op = URX_DOLLAR_M; |
| 1141 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
| 1142 op = URX_DOLLAR_D; |
| 1143 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
| 1144 op = URX_DOLLAR_MD; |
| 1145 } |
| 1146 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); |
| 1147 } |
| 1148 break; |
| 1149 |
| 1150 case doBackslashA: |
| 1151 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); |
| 1152 break; |
| 1153 |
| 1154 case doBackslashB: |
| 1155 { |
| 1156 #if UCONFIG_NO_BREAK_ITERATION==1 |
| 1157 if (fModeFlags & UREGEX_UWORD) { |
| 1158 error(U_UNSUPPORTED_ERROR); |
| 1159 } |
| 1160 #endif |
| 1161 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; |
| 1162 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); |
| 1163 } |
| 1164 break; |
| 1165 |
| 1166 case doBackslashb: |
| 1167 { |
| 1168 #if UCONFIG_NO_BREAK_ITERATION==1 |
| 1169 if (fModeFlags & UREGEX_UWORD) { |
| 1170 error(U_UNSUPPORTED_ERROR); |
| 1171 } |
| 1172 #endif |
| 1173 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; |
| 1174 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); |
| 1175 } |
| 1176 break; |
| 1177 |
| 1178 case doBackslashD: |
| 1179 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus
); |
| 1180 break; |
| 1181 |
| 1182 case doBackslashd: |
| 1183 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus
); |
| 1184 break; |
| 1185 |
| 1186 case doBackslashG: |
| 1187 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus
); |
| 1188 break; |
| 1189 |
| 1190 case doBackslashS: |
| 1191 fRXPat->fCompiledPat->addElement( |
| 1192 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); |
| 1193 break; |
| 1194 |
| 1195 case doBackslashs: |
| 1196 fRXPat->fCompiledPat->addElement( |
| 1197 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); |
| 1198 break; |
| 1199 |
| 1200 case doBackslashW: |
| 1201 fRXPat->fCompiledPat->addElement( |
| 1202 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); |
| 1203 break; |
| 1204 |
| 1205 case doBackslashw: |
| 1206 fRXPat->fCompiledPat->addElement( |
| 1207 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); |
| 1208 break; |
| 1209 |
| 1210 case doBackslashX: |
| 1211 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus
); |
| 1212 break; |
| 1213 |
| 1214 |
| 1215 case doBackslashZ: |
| 1216 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); |
| 1217 break; |
| 1218 |
| 1219 case doBackslashz: |
| 1220 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus
); |
| 1221 break; |
| 1222 |
| 1223 case doEscapeError: |
| 1224 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
| 1225 break; |
| 1226 |
| 1227 case doExit: |
| 1228 returnVal = FALSE; |
| 1229 break; |
| 1230 |
| 1231 case doProperty: |
| 1232 { |
| 1233 UnicodeSet *theSet = scanProp(); |
| 1234 compileSet(theSet); |
| 1235 } |
| 1236 break; |
| 1237 |
| 1238 case doNamedChar: |
| 1239 { |
| 1240 UChar32 c = scanNamedChar(); |
| 1241 literalChar(c); |
| 1242 } |
| 1243 break; |
| 1244 |
| 1245 |
| 1246 case doBackRef: |
| 1247 // BackReference. Somewhat unusual in that the front-end can not comple
tely parse |
| 1248 // the regular expression, because the number of digits
to be consumed |
| 1249 // depends on the number of capture groups that have bee
n defined. So |
| 1250 // we have to do it here instead. |
| 1251 { |
| 1252 int32_t numCaptureGroups = fRXPat->fGroupMap->size(); |
| 1253 int32_t groupNum = 0; |
| 1254 UChar32 c = fC.fChar; |
| 1255 |
| 1256 for (;;) { |
| 1257 // Loop once per digit, for max allowed number of digits in a ba
ck reference. |
| 1258 int32_t digit = u_charDigitValue(c); |
| 1259 groupNum = groupNum * 10 + digit; |
| 1260 if (groupNum >= numCaptureGroups) { |
| 1261 break; |
| 1262 } |
| 1263 c = peekCharLL(); |
| 1264 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c)
== FALSE) { |
| 1265 break; |
| 1266 } |
| 1267 nextCharLL(); |
| 1268 } |
| 1269 |
| 1270 // Scan of the back reference in the source regexp is complete. Now
generate |
| 1271 // the compiled code for it. |
| 1272 // Because capture groups can be forward-referenced by back-referenc
es, |
| 1273 // we fill the operand with the capture group number. At the end |
| 1274 // of compilation, it will be changed to the variable's location. |
| 1275 U_ASSERT(groupNum > 0); |
| 1276 int32_t op; |
| 1277 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 1278 op = URX_BUILD(URX_BACKREF_I, groupNum); |
| 1279 } else { |
| 1280 op = URX_BUILD(URX_BACKREF, groupNum); |
| 1281 } |
| 1282 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1283 } |
| 1284 break; |
| 1285 |
| 1286 |
| 1287 case doPossessivePlus: |
| 1288 // Possessive ++ quantifier. |
| 1289 // Compiles to |
| 1290 // 1. STO_SP |
| 1291 // 2. body of stuff being iterated over |
| 1292 // 3. STATE_SAVE 5 |
| 1293 // 4. JMP 2 |
| 1294 // 5. LD_SP |
| 1295 // 6. ... |
| 1296 // |
| 1297 // Note: TODO: This is pretty inefficient. A mass of saved state is
built up |
| 1298 // then unconditionally discarded. Perhaps introduce a n
ew opcode. Ticket 6056 |
| 1299 // |
| 1300 { |
| 1301 // Emit the STO_SP |
| 1302 int32_t topLoc = blockTopLoc(TRUE); |
| 1303 int32_t stoLoc = fRXPat->fDataSize; |
| 1304 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. |
| 1305 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); |
| 1306 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 1307 |
| 1308 // Emit the STATE_SAVE |
| 1309 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); |
| 1310 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1311 |
| 1312 // Emit the JMP |
| 1313 op = URX_BUILD(URX_JMP, topLoc+1); |
| 1314 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1315 |
| 1316 // Emit the LD_SP |
| 1317 op = URX_BUILD(URX_LD_SP, stoLoc); |
| 1318 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1319 } |
| 1320 break; |
| 1321 |
| 1322 case doPossessiveStar: |
| 1323 // Possessive *+ quantifier. |
| 1324 // Compiles to |
| 1325 // 1. STO_SP loc |
| 1326 // 2. STATE_SAVE 5 |
| 1327 // 3. body of stuff being iterated over |
| 1328 // 4. JMP 2 |
| 1329 // 5. LD_SP loc |
| 1330 // 6 ... |
| 1331 // TODO: do something to cut back the state stack each time through the
loop. |
| 1332 { |
| 1333 // Reserve two slots at the top of the block. |
| 1334 int32_t topLoc = blockTopLoc(TRUE); |
| 1335 insertOp(topLoc); |
| 1336 |
| 1337 // emit STO_SP loc |
| 1338 int32_t stoLoc = fRXPat->fDataSize; |
| 1339 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. |
| 1340 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); |
| 1341 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 1342 |
| 1343 // Emit the SAVE_STATE 5 |
| 1344 int32_t L7 = fRXPat->fCompiledPat->size()+1; |
| 1345 op = URX_BUILD(URX_STATE_SAVE, L7); |
| 1346 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); |
| 1347 |
| 1348 // Append the JMP operation. |
| 1349 op = URX_BUILD(URX_JMP, topLoc+1); |
| 1350 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1351 |
| 1352 // Emit the LD_SP loc |
| 1353 op = URX_BUILD(URX_LD_SP, stoLoc); |
| 1354 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1355 } |
| 1356 break; |
| 1357 |
| 1358 case doPossessiveOpt: |
| 1359 // Possessive ?+ quantifier. |
| 1360 // Compiles to |
| 1361 // 1. STO_SP loc |
| 1362 // 2. SAVE_STATE 5 |
| 1363 // 3. body of optional block |
| 1364 // 4. LD_SP loc |
| 1365 // 5. ... |
| 1366 // |
| 1367 { |
| 1368 // Reserve two slots at the top of the block. |
| 1369 int32_t topLoc = blockTopLoc(TRUE); |
| 1370 insertOp(topLoc); |
| 1371 |
| 1372 // Emit the STO_SP |
| 1373 int32_t stoLoc = fRXPat->fDataSize; |
| 1374 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. |
| 1375 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); |
| 1376 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 1377 |
| 1378 // Emit the SAVE_STATE |
| 1379 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
| 1380 op = URX_BUILD(URX_STATE_SAVE, continueLoc); |
| 1381 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); |
| 1382 |
| 1383 // Emit the LD_SP |
| 1384 op = URX_BUILD(URX_LD_SP, stoLoc); |
| 1385 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1386 } |
| 1387 break; |
| 1388 |
| 1389 |
| 1390 case doBeginMatchMode: |
| 1391 fNewModeFlags = fModeFlags; |
| 1392 fSetModeFlag = TRUE; |
| 1393 break; |
| 1394 |
| 1395 case doMatchMode: // (?i) and similar |
| 1396 { |
| 1397 int32_t bit = 0; |
| 1398 switch (fC.fChar) { |
| 1399 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break; |
| 1400 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break; |
| 1401 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break; |
| 1402 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break; |
| 1403 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break; |
| 1404 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break; |
| 1405 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break; |
| 1406 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break; |
| 1407 default: |
| 1408 U_ASSERT(FALSE); // Should never happen. Other chars are filt
ered out |
| 1409 // by the scanner. |
| 1410 } |
| 1411 if (fSetModeFlag) { |
| 1412 fNewModeFlags |= bit; |
| 1413 } else { |
| 1414 fNewModeFlags &= ~bit; |
| 1415 } |
| 1416 } |
| 1417 break; |
| 1418 |
| 1419 case doSetMatchMode: |
| 1420 // We've got a (?i) or similar. The match mode is being changed, but |
| 1421 // the change is not scoped to a parenthesized block. |
| 1422 U_ASSERT(fNewModeFlags < 0); |
| 1423 fModeFlags = fNewModeFlags; |
| 1424 |
| 1425 // Prevent any string from spanning across the change of match mode. |
| 1426 // Otherwise the pattern "abc(?i)def" would make a single string of "a
bcdef" |
| 1427 fixLiterals(); |
| 1428 break; |
| 1429 |
| 1430 |
| 1431 case doMatchModeParen: |
| 1432 // We've got a (?i: or similar. Begin a parenthesized block, save old |
| 1433 // mode flags so they can be restored at the close of the block. |
| 1434 // |
| 1435 // Compile to a |
| 1436 // - NOP, which later may be replaced by a save-state if the |
| 1437 // parenthesized group gets a * quantifier, followed by |
| 1438 // - NOP, which may later be replaced by a save-state if there |
| 1439 // is an '|' alternation within the parens. |
| 1440 { |
| 1441 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 1442 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 1443 |
| 1444 // On the Parentheses stack, start a new frame and add the postions |
| 1445 // of the two NOPs (a normal non-capturing () frame, except for th
e |
| 1446 // saving of the orignal mode flags.) |
| 1447 fParenStack.push(fModeFlags, *fStatus); |
| 1448 fParenStack.push(flags, *fStatus); // Fra
me Marker |
| 1449 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP |
| 1450 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP |
| 1451 |
| 1452 // Set the current mode flags to the new values. |
| 1453 U_ASSERT(fNewModeFlags < 0); |
| 1454 fModeFlags = fNewModeFlags; |
| 1455 } |
| 1456 break; |
| 1457 |
| 1458 case doBadModeFlag: |
| 1459 error(U_REGEX_INVALID_FLAG); |
| 1460 break; |
| 1461 |
| 1462 case doSuppressComments: |
| 1463 // We have just scanned a '(?'. We now need to prevent the character sc
anner from |
| 1464 // treating a '#' as a to-the-end-of-line comment. |
| 1465 // (This Perl compatibility just gets uglier and uglier to do...) |
| 1466 fEOLComments = FALSE; |
| 1467 break; |
| 1468 |
| 1469 |
| 1470 case doSetAddAmp: |
| 1471 { |
| 1472 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1473 set->add(chAmp); |
| 1474 } |
| 1475 break; |
| 1476 |
| 1477 case doSetAddDash: |
| 1478 { |
| 1479 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1480 set->add(chDash); |
| 1481 } |
| 1482 break; |
| 1483 |
| 1484 case doSetBackslash_s: |
| 1485 { |
| 1486 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1487 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); |
| 1488 break; |
| 1489 } |
| 1490 |
| 1491 case doSetBackslash_S: |
| 1492 { |
| 1493 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1494 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE
_SET]); |
| 1495 SSet.complement(); |
| 1496 set->addAll(SSet); |
| 1497 break; |
| 1498 } |
| 1499 |
| 1500 case doSetBackslash_d: |
| 1501 { |
| 1502 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1503 // TODO - make a static set, ticket 6058. |
| 1504 addCategory(set, U_GC_ND_MASK, *fStatus); |
| 1505 break; |
| 1506 } |
| 1507 |
| 1508 case doSetBackslash_D: |
| 1509 { |
| 1510 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1511 UnicodeSet digits; |
| 1512 // TODO - make a static set, ticket 6058. |
| 1513 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MA
SK, *fStatus); |
| 1514 digits.complement(); |
| 1515 set->addAll(digits); |
| 1516 break; |
| 1517 } |
| 1518 |
| 1519 case doSetBackslash_w: |
| 1520 { |
| 1521 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1522 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]
); |
| 1523 break; |
| 1524 } |
| 1525 |
| 1526 case doSetBackslash_W: |
| 1527 { |
| 1528 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); |
| 1529 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_
SET]); |
| 1530 SSet.complement(); |
| 1531 set->addAll(SSet); |
| 1532 break; |
| 1533 } |
| 1534 |
| 1535 case doSetBegin: |
| 1536 fSetStack.push(new UnicodeSet(), *fStatus); |
| 1537 fSetOpStack.push(setStart, *fStatus); |
| 1538 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { |
| 1539 fSetOpStack.push(setCaseClose, *fStatus); |
| 1540 } |
| 1541 break; |
| 1542 |
| 1543 case doSetBeginDifference1: |
| 1544 // We have scanned something like [[abc]-[ |
| 1545 // Set up a new UnicodeSet for the set beginning with the just-scanned
'[' |
| 1546 // Push a Difference operator, which will cause the new set to be subtr
acted from what |
| 1547 // went before once it is created. |
| 1548 setPushOp(setDifference1); |
| 1549 fSetOpStack.push(setStart, *fStatus); |
| 1550 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { |
| 1551 fSetOpStack.push(setCaseClose, *fStatus); |
| 1552 } |
| 1553 break; |
| 1554 |
| 1555 case doSetBeginIntersection1: |
| 1556 // We have scanned something like [[abc]&[ |
| 1557 // Need both the '&' operator and the open '[' operator. |
| 1558 setPushOp(setIntersection1); |
| 1559 fSetOpStack.push(setStart, *fStatus); |
| 1560 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { |
| 1561 fSetOpStack.push(setCaseClose, *fStatus); |
| 1562 } |
| 1563 break; |
| 1564 |
| 1565 case doSetBeginUnion: |
| 1566 // We have scanned something like [[abc][ |
| 1567 // Need to handle the union operation explicitly [[abc] | [ |
| 1568 setPushOp(setUnion); |
| 1569 fSetOpStack.push(setStart, *fStatus); |
| 1570 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { |
| 1571 fSetOpStack.push(setCaseClose, *fStatus); |
| 1572 } |
| 1573 break; |
| 1574 |
| 1575 case doSetDifference2: |
| 1576 // We have scanned something like [abc-- |
| 1577 // Consider this to unambiguously be a set difference operator. |
| 1578 setPushOp(setDifference2); |
| 1579 break; |
| 1580 |
| 1581 case doSetEnd: |
| 1582 // Have encountered the ']' that closes a set. |
| 1583 // Force the evaluation of any pending operations within this set, |
| 1584 // leave the completed set on the top of the set stack. |
| 1585 setEval(setEnd); |
| 1586 U_ASSERT(fSetOpStack.peeki()==setStart); |
| 1587 fSetOpStack.popi(); |
| 1588 break; |
| 1589 |
| 1590 case doSetFinish: |
| 1591 { |
| 1592 // Finished a complete set expression, including all nested sets. |
| 1593 // The close bracket has already triggered clearing out pending set op
erators, |
| 1594 // the operator stack should be empty and the operand stack should ha
ve just |
| 1595 // one entry, the result set. |
| 1596 U_ASSERT(fSetOpStack.empty()); |
| 1597 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); |
| 1598 U_ASSERT(fSetStack.empty()); |
| 1599 compileSet(theSet); |
| 1600 break; |
| 1601 } |
| 1602 |
| 1603 case doSetIntersection2: |
| 1604 // Have scanned something like [abc&& |
| 1605 setPushOp(setIntersection2); |
| 1606 break; |
| 1607 |
| 1608 case doSetLiteral: |
| 1609 // Union the just-scanned literal character into the set being built. |
| 1610 // This operation is the highest precedence set operation, so we can
always do |
| 1611 // it immediately, without waiting to see what follows. It is necess
ary to perform |
| 1612 // any pending '-' or '&' operation first, because these have the sam
e precedence |
| 1613 // as union-ing in a literal' |
| 1614 { |
| 1615 setEval(setUnion); |
| 1616 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); |
| 1617 s->add(fC.fChar); |
| 1618 fLastSetLiteral = fC.fChar; |
| 1619 break; |
| 1620 } |
| 1621 |
| 1622 case doSetLiteralEscaped: |
| 1623 // A back-slash escaped literal character was encountered. |
| 1624 // Processing is the same as with setLiteral, above, with the addition o
f |
| 1625 // the optional check for errors on escaped ASCII letters. |
| 1626 { |
| 1627 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && |
| 1628 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] |
| 1629 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] |
| 1630 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
| 1631 } |
| 1632 setEval(setUnion); |
| 1633 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); |
| 1634 s->add(fC.fChar); |
| 1635 fLastSetLiteral = fC.fChar; |
| 1636 break; |
| 1637 } |
| 1638 |
| 1639 case doSetNamedChar: |
| 1640 // Scanning a \N{UNICODE CHARACTER NAME} |
| 1641 // Aside from the source of the character, the processing is identical
to doSetLiteral, |
| 1642 // above. |
| 1643 { |
| 1644 UChar32 c = scanNamedChar(); |
| 1645 setEval(setUnion); |
| 1646 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); |
| 1647 s->add(c); |
| 1648 fLastSetLiteral = c; |
| 1649 break; |
| 1650 } |
| 1651 |
| 1652 case doSetNamedRange: |
| 1653 // We have scanned literal-\N{CHAR NAME}. Add the range to the set. |
| 1654 // The left character is already in the set, and is saved in fLastSetLit
eral. |
| 1655 // The right side needs to be picked up, the scan is at the 'N'. |
| 1656 // Lower Limit > Upper limit being an error matches both Java |
| 1657 // and ICU UnicodeSet behavior. |
| 1658 { |
| 1659 UChar32 c = scanNamedChar(); |
| 1660 if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) { |
| 1661 error(U_REGEX_INVALID_RANGE); |
| 1662 } |
| 1663 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); |
| 1664 s->add(fLastSetLiteral, c); |
| 1665 fLastSetLiteral = c; |
| 1666 break; |
| 1667 } |
| 1668 |
| 1669 |
| 1670 case doSetNegate: |
| 1671 // Scanned a '^' at the start of a set. |
| 1672 // Push the negation operator onto the set op stack. |
| 1673 // A twist for case-insensitive matching: |
| 1674 // the case closure operation must happen _before_ negation. |
| 1675 // But the case closure operation will already be on the stack if it's
required. |
| 1676 // This requires checking for case closure, and swapping the stack ord
er |
| 1677 // if it is present. |
| 1678 { |
| 1679 int32_t tosOp = fSetOpStack.peeki(); |
| 1680 if (tosOp == setCaseClose) { |
| 1681 fSetOpStack.popi(); |
| 1682 fSetOpStack.push(setNegation, *fStatus); |
| 1683 fSetOpStack.push(setCaseClose, *fStatus); |
| 1684 } else { |
| 1685 fSetOpStack.push(setNegation, *fStatus); |
| 1686 } |
| 1687 } |
| 1688 break; |
| 1689 |
| 1690 case doSetNoCloseError: |
| 1691 error(U_REGEX_MISSING_CLOSE_BRACKET); |
| 1692 break; |
| 1693 |
| 1694 case doSetOpError: |
| 1695 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal
. |
| 1696 break; |
| 1697 |
| 1698 case doSetPosixProp: |
| 1699 { |
| 1700 UnicodeSet *s = scanPosixProp(); |
| 1701 if (s != NULL) { |
| 1702 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); |
| 1703 tos->addAll(*s); |
| 1704 delete s; |
| 1705 } // else error. scanProp() reported the error status already. |
| 1706 } |
| 1707 break; |
| 1708 |
| 1709 case doSetProp: |
| 1710 // Scanned a \p \P within [brackets]. |
| 1711 { |
| 1712 UnicodeSet *s = scanProp(); |
| 1713 if (s != NULL) { |
| 1714 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); |
| 1715 tos->addAll(*s); |
| 1716 delete s; |
| 1717 } // else error. scanProp() reported the error status already. |
| 1718 } |
| 1719 break; |
| 1720 |
| 1721 |
| 1722 case doSetRange: |
| 1723 // We have scanned literal-literal. Add the range to the set. |
| 1724 // The left character is already in the set, and is saved in fLastSetLit
eral. |
| 1725 // The right side is the current character. |
| 1726 // Lower Limit > Upper limit being an error matches both Java |
| 1727 // and ICU UnicodeSet behavior. |
| 1728 { |
| 1729 if (fLastSetLiteral > fC.fChar) { |
| 1730 error(U_REGEX_INVALID_RANGE); |
| 1731 } |
| 1732 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); |
| 1733 s->add(fLastSetLiteral, fC.fChar); |
| 1734 break; |
| 1735 } |
| 1736 |
| 1737 |
| 1738 default: |
| 1739 U_ASSERT(FALSE); |
| 1740 error(U_REGEX_INTERNAL_ERROR); |
| 1741 break; |
| 1742 } |
| 1743 |
| 1744 if (U_FAILURE(*fStatus)) { |
| 1745 returnVal = FALSE; |
| 1746 } |
| 1747 |
| 1748 return returnVal; |
| 1749 } |
| 1750 |
| 1751 |
| 1752 |
| 1753 //------------------------------------------------------------------------------ |
| 1754 // |
| 1755 // literalChar We've encountered a literal character from the patter
n, |
| 1756 // or an escape sequence that reduces to a character
. |
| 1757 // Add it to the string containing all literal chars/str
ings from |
| 1758 // the pattern. |
| 1759 // If we are in a pattern string already, add the new ch
ar to it. |
| 1760 // If we aren't in a pattern string, begin one now. |
| 1761 // |
| 1762 //------------------------------------------------------------------------------ |
| 1763 void RegexCompile::literalChar(UChar32 c) { |
| 1764 int32_t op; // An operation in the compiled pattern. |
| 1765 int32_t opType; |
| 1766 int32_t patternLoc; // A position in the compiled pattern. |
| 1767 int32_t stringLen; |
| 1768 |
| 1769 |
| 1770 // If the last thing compiled into the pattern was not a literal char, |
| 1771 // force this new literal char to begin a new string, and not append to th
e previous. |
| 1772 op = (int32_t)fRXPat->fCompiledPat->lastElementi(); |
| 1773 opType = URX_TYPE(op); |
| 1774 if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR || opType == URX_ONE
CHAR_I)) { |
| 1775 fixLiterals(); |
| 1776 } |
| 1777 |
| 1778 if (fStringOpStart == -1) { |
| 1779 // First char of a string in the pattern. |
| 1780 // Emit a OneChar op into the compiled pattern. |
| 1781 emitONE_CHAR(c); |
| 1782 |
| 1783 // Mark that we might actually be starting a string here |
| 1784 fStringOpStart = fRXPat->fLiteralText.length(); |
| 1785 return; |
| 1786 } |
| 1787 |
| 1788 op = (int32_t)fRXPat->fCompiledPat->lastElementi(); |
| 1789 opType = URX_TYPE(op); |
| 1790 U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_S
TRING_LEN); |
| 1791 |
| 1792 // If the most recently emitted op is a URX_ONECHAR, |
| 1793 if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) { |
| 1794 if (U16_IS_TRAIL(c) && U16_IS_LEAD(URX_VAL(op))) { |
| 1795 // The most recently emitted op is a ONECHAR that was the first half |
| 1796 // of a surrogate pair. Update the ONECHAR's operand to be the |
| 1797 // supplementary code point resulting from both halves of the pair
. |
| 1798 c = U16_GET_SUPPLEMENTARY(URX_VAL(op), c); |
| 1799 op = URX_BUILD(opType, c); |
| 1800 patternLoc = fRXPat->fCompiledPat->size() - 1; |
| 1801 fRXPat->fCompiledPat->setElementAt(op, patternLoc); |
| 1802 return; |
| 1803 } |
| 1804 |
| 1805 // The most recently emitted op is a ONECHAR. |
| 1806 // We've now received another adjacent char. Change the ONECHAR op |
| 1807 // to a string op. |
| 1808 fRXPat->fLiteralText.append(URX_VAL(op)); |
| 1809 |
| 1810 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 1811 op = URX_BUILD(URX_STRING_I, fStringOpStart); |
| 1812 } else { |
| 1813 op = URX_BUILD(URX_STRING, fStringOpStart); |
| 1814 } |
| 1815 patternLoc = fRXPat->fCompiledPat->size() - 1; |
| 1816 fRXPat->fCompiledPat->setElementAt(op, patternLoc); |
| 1817 op = URX_BUILD(URX_STRING_LEN, 0); |
| 1818 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1819 } |
| 1820 |
| 1821 // We are adding onto an existing string |
| 1822 fRXPat->fLiteralText.append(c); |
| 1823 |
| 1824 // The pattern contains a URX_SRING / URX_STRING_LEN. Update the |
| 1825 // string length to reflect the new char we just added to the string. |
| 1826 stringLen = fRXPat->fLiteralText.length() - fStringOpStart; |
| 1827 op = URX_BUILD(URX_STRING_LEN, stringLen); |
| 1828 patternLoc = fRXPat->fCompiledPat->size() - 1; |
| 1829 fRXPat->fCompiledPat->setElementAt(op, patternLoc); |
| 1830 } |
| 1831 |
| 1832 |
| 1833 |
| 1834 //------------------------------------------------------------------------------ |
| 1835 // |
| 1836 // emitONE_CHAR emit a ONE_CHAR op into the generated code. |
| 1837 // Choose cased or uncased version, depending on the |
| 1838 // match mode and whether the character itself is cased. |
| 1839 // |
| 1840 //------------------------------------------------------------------------------ |
| 1841 void RegexCompile::emitONE_CHAR(UChar32 c) { |
| 1842 int32_t op; |
| 1843 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && |
| 1844 u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { |
| 1845 // We have a cased character, and are in case insensitive matching mode. |
| 1846 //c = u_foldCase(c, U_FOLD_CASE_DEFAULT); // !!!: handled in stripNOPs
() now |
| 1847 op = URX_BUILD(URX_ONECHAR_I, c); |
| 1848 } else { |
| 1849 // Uncased char, or case sensitive match mode. |
| 1850 // Either way, just generate a literal compare of the char. |
| 1851 op = URX_BUILD(URX_ONECHAR, c); |
| 1852 } |
| 1853 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1854 } |
| 1855 |
| 1856 |
| 1857 //------------------------------------------------------------------------------ |
| 1858 // |
| 1859 // fixLiterals When compiling something that can follow a literal |
| 1860 // string in a pattern, we need to "fix" any preceding |
| 1861 // string, which will cause any subsequent literals to |
| 1862 // begin a new string, rather than appending to the |
| 1863 // old one. |
| 1864 // |
| 1865 // Optionally, split the last char of the string off in
to |
| 1866 // a single "ONE_CHAR" operation, so that quantifiers c
an |
| 1867 // apply to that char alone. Example: abc* |
| 1868 // The * must apply to the 'c' only. |
| 1869 // |
| 1870 //------------------------------------------------------------------------------ |
| 1871 void RegexCompile::fixLiterals(UBool split) { |
| 1872 int32_t stringStart = fStringOpStart; // start index of the current lite
ral string |
| 1873 int32_t op; // An op from/for the compiled pat
tern. |
| 1874 int32_t opType; // An opcode type from the compile
d pattern. |
| 1875 int32_t stringLastCharIdx; |
| 1876 UChar32 lastChar; |
| 1877 int32_t stringNextToLastCharIdx; |
| 1878 UChar32 nextToLastChar; |
| 1879 int32_t stringLen; |
| 1880 |
| 1881 fStringOpStart = -1; |
| 1882 if (!split) { |
| 1883 return; |
| 1884 } |
| 1885 |
| 1886 // Split: We need to ensure that the last item in the compiled pattern doe
s |
| 1887 // not refer to a literal string of more than one char. If it does, |
| 1888 // separate the last char from the rest of the string. |
| 1889 |
| 1890 // If the last operation from the compiled pattern is not a string, |
| 1891 // nothing needs to be done |
| 1892 op = (int32_t)fRXPat->fCompiledPat->lastElementi(); |
| 1893 opType = URX_TYPE(op); |
| 1894 if (opType != URX_STRING_LEN) { |
| 1895 return; |
| 1896 } |
| 1897 stringLen = URX_VAL(op); |
| 1898 |
| 1899 // |
| 1900 // Find the position of the last code point in the string (might be a surro
gate pair) |
| 1901 // |
| 1902 stringLastCharIdx = fRXPat->fLiteralText.length(); |
| 1903 stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1); |
| 1904 lastChar = fRXPat->fLiteralText.char32At(stringLastCharIdx); |
| 1905 |
| 1906 // The string should always be at least two code points long, meaning that t
here |
| 1907 // should be something before the last char position that we just found. |
| 1908 U_ASSERT(stringLastCharIdx > stringStart); |
| 1909 stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx
, -1); |
| 1910 U_ASSERT(stringNextToLastCharIdx >= stringStart); |
| 1911 nextToLastChar = fRXPat->fLiteralText.char32At(stringNextToLastChar
Idx); |
| 1912 |
| 1913 if (stringNextToLastCharIdx > stringStart) { |
| 1914 // The length of string remaining after removing one char is two or more
. |
| 1915 // Leave the string in the compiled pattern, shorten it by one char, |
| 1916 // and append a URX_ONECHAR op for the last char. |
| 1917 stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx); |
| 1918 op = URX_BUILD(URX_STRING_LEN, stringLen); |
| 1919 fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1); |
| 1920 emitONE_CHAR(lastChar); |
| 1921 } else { |
| 1922 // The original string consisted of exactly two characters. Replace |
| 1923 // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair |
| 1924 // of URX_ONECHARs. |
| 1925 fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2); |
| 1926 emitONE_CHAR(nextToLastChar); |
| 1927 emitONE_CHAR(lastChar); |
| 1928 } |
| 1929 } |
| 1930 |
| 1931 |
| 1932 |
| 1933 |
| 1934 |
| 1935 |
| 1936 //------------------------------------------------------------------------------ |
| 1937 // |
| 1938 // insertOp() Insert a slot for a new opcode into the already |
| 1939 // compiled pattern code. |
| 1940 // |
| 1941 // Fill the slot with a NOP. Our caller will replace i
t |
| 1942 // with what they really wanted. |
| 1943 // |
| 1944 //------------------------------------------------------------------------------ |
| 1945 void RegexCompile::insertOp(int32_t where) { |
| 1946 UVector64 *code = fRXPat->fCompiledPat; |
| 1947 U_ASSERT(where>0 && where < code->size()); |
| 1948 |
| 1949 int32_t nop = URX_BUILD(URX_NOP, 0); |
| 1950 code->insertElementAt(nop, where, *fStatus); |
| 1951 |
| 1952 // Walk through the pattern, looking for any ops with targets that |
| 1953 // were moved down by the insert. Fix them. |
| 1954 int32_t loc; |
| 1955 for (loc=0; loc<code->size(); loc++) { |
| 1956 int32_t op = (int32_t)code->elementAti(loc); |
| 1957 int32_t opType = URX_TYPE(op); |
| 1958 int32_t opValue = URX_VAL(op); |
| 1959 if ((opType == URX_JMP || |
| 1960 opType == URX_JMPX || |
| 1961 opType == URX_STATE_SAVE || |
| 1962 opType == URX_CTR_LOOP || |
| 1963 opType == URX_CTR_LOOP_NG || |
| 1964 opType == URX_JMP_SAV || |
| 1965 opType == URX_RELOC_OPRND) && opValue > where) { |
| 1966 // Target location for this opcode is after the insertion point and |
| 1967 // needs to be incremented to adjust for the insertion. |
| 1968 opValue++; |
| 1969 op = URX_BUILD(opType, opValue); |
| 1970 code->setElementAt(op, loc); |
| 1971 } |
| 1972 } |
| 1973 |
| 1974 // Now fix up the parentheses stack. All positive values in it are location
s in |
| 1975 // the compiled pattern. (Negative values are frame boundaries, and don't
need fixing.) |
| 1976 for (loc=0; loc<fParenStack.size(); loc++) { |
| 1977 int32_t x = fParenStack.elementAti(loc); |
| 1978 U_ASSERT(x < code->size()); |
| 1979 if (x>where) { |
| 1980 x++; |
| 1981 fParenStack.setElementAt(x, loc); |
| 1982 } |
| 1983 } |
| 1984 |
| 1985 if (fMatchCloseParen > where) { |
| 1986 fMatchCloseParen++; |
| 1987 } |
| 1988 if (fMatchOpenParen > where) { |
| 1989 fMatchOpenParen++; |
| 1990 } |
| 1991 } |
| 1992 |
| 1993 |
| 1994 |
| 1995 //------------------------------------------------------------------------------ |
| 1996 // |
| 1997 // blockTopLoc() Find or create a location in the compiled pattern |
| 1998 // at the start of the operation or block that has |
| 1999 // just been compiled. Needed when a quantifier (* or |
| 2000 // whatever) appears, and we need to add an operation |
| 2001 // at the start of the thing being quantified. |
| 2002 // |
| 2003 // (Parenthesized Blocks) have a slot with a NOP that |
| 2004 // is reserved for this purpose. .* or similar don't |
| 2005 // and a slot needs to be added. |
| 2006 // |
| 2007 // parameter reserveLoc : TRUE - ensure that there is space to add an
opcode |
| 2008 // at the returned location. |
| 2009 // FALSE - just return the address, |
| 2010 // do not reserve a location there. |
| 2011 // |
| 2012 //------------------------------------------------------------------------------ |
| 2013 int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { |
| 2014 int32_t theLoc; |
| 2015 if (fRXPat->fCompiledPat->size() == fMatchCloseParen) |
| 2016 { |
| 2017 // The item just processed is a parenthesized block. |
| 2018 theLoc = fMatchOpenParen; // A slot is already reserved for us. |
| 2019 U_ASSERT(theLoc > 0); |
| 2020 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc)))
== URX_NOP); |
| 2021 } |
| 2022 else { |
| 2023 // Item just compiled is a single thing, a ".", or a single char, or a s
et reference. |
| 2024 // No slot for STATE_SAVE was pre-reserved in the compiled code. |
| 2025 // We need to make space now. |
| 2026 fixLiterals(TRUE); // If last item was a string, separate the last char
. |
| 2027 theLoc = fRXPat->fCompiledPat->size()-1; |
| 2028 if (reserveLoc) { |
| 2029 /*int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);*/ |
| 2030 int32_t nop = URX_BUILD(URX_NOP, 0); |
| 2031 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); |
| 2032 } |
| 2033 } |
| 2034 return theLoc; |
| 2035 } |
| 2036 |
| 2037 |
| 2038 |
| 2039 //------------------------------------------------------------------------------ |
| 2040 // |
| 2041 // handleCloseParen When compiling a close paren, we need to go back |
| 2042 // and fix up any JMP or SAVE operations within the |
| 2043 // parenthesized block that need to target the end |
| 2044 // of the block. The locations of these are kept on |
| 2045 // the paretheses stack. |
| 2046 // |
| 2047 // This function is called both when encountering a |
| 2048 // real ) and at the end of the pattern. |
| 2049 // |
| 2050 //------------------------------------------------------------------------------ |
| 2051 void RegexCompile::handleCloseParen() { |
| 2052 int32_t patIdx; |
| 2053 int32_t patOp; |
| 2054 if (fParenStack.size() <= 0) { |
| 2055 error(U_REGEX_MISMATCHED_PAREN); |
| 2056 return; |
| 2057 } |
| 2058 |
| 2059 // Force any literal chars that may follow the close paren to start a new st
ring, |
| 2060 // and not attach to any preceding it. |
| 2061 fixLiterals(FALSE); |
| 2062 |
| 2063 // Fixup any operations within the just-closed parenthesized group |
| 2064 // that need to reference the end of the (block). |
| 2065 // (The first one popped from the stack is an unused slot for |
| 2066 // alternation (OR) state save, but applying the fixup to it does no har
m.) |
| 2067 for (;;) { |
| 2068 patIdx = fParenStack.popi(); |
| 2069 if (patIdx < 0) { |
| 2070 // value < 0 flags the start of the frame on the paren stack. |
| 2071 break; |
| 2072 } |
| 2073 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size()); |
| 2074 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx); |
| 2075 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should
not be set. |
| 2076 patOp |= fRXPat->fCompiledPat->size(); // Set it now. |
| 2077 fRXPat->fCompiledPat->setElementAt(patOp, patIdx); |
| 2078 fMatchOpenParen = patIdx; |
| 2079 } |
| 2080 |
| 2081 // At the close of any parenthesized block, restore the match mode flags t
o |
| 2082 // the value they had at the open paren. Saved value is |
| 2083 // at the top of the paren stack. |
| 2084 fModeFlags = fParenStack.popi(); |
| 2085 U_ASSERT(fModeFlags < 0); |
| 2086 |
| 2087 // DO any additional fixups, depending on the specific kind of |
| 2088 // parentesized grouping this is |
| 2089 |
| 2090 switch (patIdx) { |
| 2091 case plain: |
| 2092 case flags: |
| 2093 // No additional fixups required. |
| 2094 // (Grouping-only parentheses) |
| 2095 break; |
| 2096 case capturing: |
| 2097 // Capturing Parentheses. |
| 2098 // Insert a End Capture op into the pattern. |
| 2099 // The frame offset of the variables for this cg is obtained from the |
| 2100 // start capture op and put it into the end-capture op. |
| 2101 { |
| 2102 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMat
chOpenParen+1); |
| 2103 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); |
| 2104 |
| 2105 int32_t frameVarLocation = URX_VAL(captureOp); |
| 2106 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation
); |
| 2107 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); |
| 2108 } |
| 2109 break; |
| 2110 case atomic: |
| 2111 // Atomic Parenthesis. |
| 2112 // Insert a LD_SP operation to restore the state stack to the position |
| 2113 // it was when the atomic parens were entered. |
| 2114 { |
| 2115 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOp
enParen+1); |
| 2116 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); |
| 2117 int32_t stoLoc = URX_VAL(stoOp); |
| 2118 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); |
| 2119 fRXPat->fCompiledPat->addElement(ldOp, *fStatus); |
| 2120 } |
| 2121 break; |
| 2122 |
| 2123 case lookAhead: |
| 2124 { |
| 2125 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); |
| 2126 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); |
| 2127 int32_t dataLoc = URX_VAL(startOp); |
| 2128 int32_t op = URX_BUILD(URX_LA_END, dataLoc); |
| 2129 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2130 } |
| 2131 break; |
| 2132 |
| 2133 case negLookAhead: |
| 2134 { |
| 2135 // See comment at doOpenLookAheadNeg |
| 2136 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-1); |
| 2137 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); |
| 2138 int32_t dataLoc = URX_VAL(startOp); |
| 2139 int32_t op = URX_BUILD(URX_LA_END, dataLoc); |
| 2140 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2141 op = URX_BUILD(URX_BACKTRACK, 0); |
| 2142 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2143 op = URX_BUILD(URX_LA_END, dataLoc); |
| 2144 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2145 |
| 2146 // Patch the URX_SAVE near the top of the block. |
| 2147 // The destination of the SAVE is the final LA_END that was just add
ed. |
| 2148 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen); |
| 2149 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); |
| 2150 int32_t dest = fRXPat->fCompiledPat->size()-1; |
| 2151 saveOp = URX_BUILD(URX_STATE_SAVE, dest); |
| 2152 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); |
| 2153 } |
| 2154 break; |
| 2155 |
| 2156 case lookBehind: |
| 2157 { |
| 2158 // See comment at doOpenLookBehind. |
| 2159 |
| 2160 // Append the URX_LB_END and URX_LA_END to the compiled pattern. |
| 2161 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-4); |
| 2162 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); |
| 2163 int32_t dataLoc = URX_VAL(startOp); |
| 2164 int32_t op = URX_BUILD(URX_LB_END, dataLoc); |
| 2165 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2166 op = URX_BUILD(URX_LA_END, dataLoc); |
| 2167 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2168 |
| 2169 // Determine the min and max bounds for the length of the |
| 2170 // string that the pattern can match. |
| 2171 // An unbounded upper limit is an error. |
| 2172 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; |
| 2173 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); |
| 2174 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); |
| 2175 if (maxML == INT32_MAX) { |
| 2176 error(U_REGEX_LOOK_BEHIND_LIMIT); |
| 2177 break; |
| 2178 } |
| 2179 U_ASSERT(minML <= maxML); |
| 2180 |
| 2181 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat |
| 2182 // appears at the top of the look-behind block, at location fMatchO
penParen+1 |
| 2183 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); |
| 2184 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); |
| 2185 |
| 2186 } |
| 2187 break; |
| 2188 |
| 2189 |
| 2190 |
| 2191 case lookBehindN: |
| 2192 { |
| 2193 // See comment at doOpenLookBehindNeg. |
| 2194 |
| 2195 // Append the URX_LBN_END to the compiled pattern. |
| 2196 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); |
| 2197 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); |
| 2198 int32_t dataLoc = URX_VAL(startOp); |
| 2199 int32_t op = URX_BUILD(URX_LBN_END, dataLoc); |
| 2200 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2201 |
| 2202 // Determine the min and max bounds for the length of the |
| 2203 // string that the pattern can match. |
| 2204 // An unbounded upper limit is an error. |
| 2205 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; |
| 2206 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); |
| 2207 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); |
| 2208 if (maxML == INT32_MAX) { |
| 2209 error(U_REGEX_LOOK_BEHIND_LIMIT); |
| 2210 break; |
| 2211 } |
| 2212 U_ASSERT(minML <= maxML); |
| 2213 |
| 2214 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat |
| 2215 // appears at the top of the look-behind block, at location fMatchO
penParen+1 |
| 2216 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); |
| 2217 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); |
| 2218 |
| 2219 // Insert the pattern location to continue at after a successful mat
ch |
| 2220 // as the last operand of the URX_LBN_CONT |
| 2221 op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); |
| 2222 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); |
| 2223 } |
| 2224 break; |
| 2225 |
| 2226 |
| 2227 |
| 2228 default: |
| 2229 U_ASSERT(FALSE); |
| 2230 } |
| 2231 |
| 2232 // remember the next location in the compiled pattern. |
| 2233 // The compilation of Quantifiers will look at this to see whether its loopi
ng |
| 2234 // over a parenthesized block or a single item |
| 2235 fMatchCloseParen = fRXPat->fCompiledPat->size(); |
| 2236 } |
| 2237 |
| 2238 |
| 2239 |
| 2240 //------------------------------------------------------------------------------ |
| 2241 // |
| 2242 // compileSet Compile the pattern operations for a reference to a |
| 2243 // UnicodeSet. |
| 2244 // |
| 2245 //------------------------------------------------------------------------------ |
| 2246 void RegexCompile::compileSet(UnicodeSet *theSet) |
| 2247 { |
| 2248 if (theSet == NULL) { |
| 2249 return; |
| 2250 } |
| 2251 // Remove any strings from the set. |
| 2252 // There shoudn't be any, but just in case. |
| 2253 // (Case Closure can add them; if we had a simple case closure avaialble
that |
| 2254 // ignored strings, that would be better.) |
| 2255 theSet->removeAllStrings(); |
| 2256 int32_t setSize = theSet->size(); |
| 2257 |
| 2258 switch (setSize) { |
| 2259 case 0: |
| 2260 { |
| 2261 // Set of no elements. Always fails to match. |
| 2262 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStat
us); |
| 2263 delete theSet; |
| 2264 } |
| 2265 break; |
| 2266 |
| 2267 case 1: |
| 2268 { |
| 2269 // The set contains only a single code point. Put it into |
| 2270 // the compiled pattern as a single char operation rather |
| 2271 // than a set, and discard the set itself. |
| 2272 literalChar(theSet->charAt(0)); |
| 2273 delete theSet; |
| 2274 } |
| 2275 break; |
| 2276 |
| 2277 default: |
| 2278 { |
| 2279 // The set contains two or more chars. (the normal case) |
| 2280 // Put it into the compiled pattern as a set. |
| 2281 int32_t setNumber = fRXPat->fSets->size(); |
| 2282 fRXPat->fSets->addElement(theSet, *fStatus); |
| 2283 int32_t setOp = URX_BUILD(URX_SETREF, setNumber); |
| 2284 fRXPat->fCompiledPat->addElement(setOp, *fStatus); |
| 2285 } |
| 2286 } |
| 2287 } |
| 2288 |
| 2289 |
| 2290 //------------------------------------------------------------------------------ |
| 2291 // |
| 2292 // compileInterval Generate the code for a {min, max} style interval quanti
fier. |
| 2293 // Except for the specific opcodes used, the code is the sa
me |
| 2294 // for all three types (greedy, non-greedy, possessive) of |
| 2295 // intervals. The opcodes are supplied as parameters. |
| 2296 // |
| 2297 // The code for interval loops has this form: |
| 2298 // 0 CTR_INIT counter loc (in stack frame) |
| 2299 // 1 5 patt address of CTR_LOOP at bottom o
f block |
| 2300 // 2 min count |
| 2301 // 3 max count (-1 for unbounded) |
| 2302 // 4 ... block to be iterated over |
| 2303 // 5 CTR_LOOP |
| 2304 // |
| 2305 // In |
| 2306 //------------------------------------------------------------------------------ |
| 2307 void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) |
| 2308 { |
| 2309 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes |
| 2310 // four slots in the compiled code. Reserve them. |
| 2311 int32_t topOfBlock = blockTopLoc(TRUE); |
| 2312 insertOp(topOfBlock); |
| 2313 insertOp(topOfBlock); |
| 2314 insertOp(topOfBlock); |
| 2315 |
| 2316 // The operands for the CTR_INIT opcode include the index in the matcher dat
a |
| 2317 // of the counter. Allocate it now. |
| 2318 int32_t counterLoc = fRXPat->fFrameSize; |
| 2319 fRXPat->fFrameSize++; |
| 2320 |
| 2321 int32_t op = URX_BUILD(InitOp, counterLoc); |
| 2322 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); |
| 2323 |
| 2324 // The second operand of CTR_INIT is the location following the end of the l
oop. |
| 2325 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if
the |
| 2326 // compilation of something later on causes the code to grow and the targe
t |
| 2327 // position to move. |
| 2328 int32_t loopEnd = fRXPat->fCompiledPat->size(); |
| 2329 op = URX_BUILD(URX_RELOC_OPRND, loopEnd); |
| 2330 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); |
| 2331 |
| 2332 // Followed by the min and max counts. |
| 2333 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); |
| 2334 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); |
| 2335 |
| 2336 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. |
| 2337 // Goes at end of the block being looped over, so just append to the code
so far. |
| 2338 op = URX_BUILD(LoopOp, topOfBlock); |
| 2339 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2340 |
| 2341 if ((fIntervalLow & 0xff000000) != 0 || |
| 2342 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { |
| 2343 error(U_REGEX_NUMBER_TOO_BIG); |
| 2344 } |
| 2345 |
| 2346 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { |
| 2347 error(U_REGEX_MAX_LT_MIN); |
| 2348 } |
| 2349 } |
| 2350 |
| 2351 |
| 2352 |
| 2353 UBool RegexCompile::compileInlineInterval() { |
| 2354 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { |
| 2355 // Too big to inline. Fail, which will cause looping code to be generat
ed. |
| 2356 // (Upper < Lower picks up unbounded upper and errors, both.) |
| 2357 return FALSE; |
| 2358 } |
| 2359 |
| 2360 int32_t topOfBlock = blockTopLoc(FALSE); |
| 2361 if (fIntervalUpper == 0) { |
| 2362 // Pathological case. Attempt no matches, as if the block doesn't exist
. |
| 2363 fRXPat->fCompiledPat->setSize(topOfBlock); |
| 2364 return TRUE; |
| 2365 } |
| 2366 |
| 2367 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { |
| 2368 // The thing being repeated is not a single op, but some |
| 2369 // more complex block. Do it as a loop, not inlines. |
| 2370 // Note that things "repeated" a max of once are handled as inline, be
cause |
| 2371 // the one copy of the code already generated is just fine. |
| 2372 return FALSE; |
| 2373 } |
| 2374 |
| 2375 // Pick up the opcode that is to be repeated |
| 2376 // |
| 2377 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); |
| 2378 |
| 2379 // Compute the pattern location where the inline sequence |
| 2380 // will end, and set up the state save op that will be needed. |
| 2381 // |
| 2382 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 |
| 2383 + fIntervalUpper + (fIntervalUpper-fIntervalLow)
; |
| 2384 int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); |
| 2385 if (fIntervalLow == 0) { |
| 2386 insertOp(topOfBlock); |
| 2387 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); |
| 2388 } |
| 2389 |
| 2390 |
| 2391 |
| 2392 // Loop, emitting the op for the thing being repeated each time. |
| 2393 // Loop starts at 1 because one instance of the op already exists in the
pattern, |
| 2394 // it was put there when it was originally encountered. |
| 2395 int32_t i; |
| 2396 for (i=1; i<fIntervalUpper; i++ ) { |
| 2397 if (i == fIntervalLow) { |
| 2398 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); |
| 2399 } |
| 2400 if (i > fIntervalLow) { |
| 2401 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); |
| 2402 } |
| 2403 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2404 } |
| 2405 return TRUE; |
| 2406 } |
| 2407 |
| 2408 |
| 2409 |
| 2410 //------------------------------------------------------------------------------ |
| 2411 // |
| 2412 // matchStartType Determine how a match can start. |
| 2413 // Used to optimize find() operations. |
| 2414 // |
| 2415 // Operation is very similar to minMatchLength(). Walk the
compiled |
| 2416 // pattern, keeping an on-going minimum-match-length. For a
ny |
| 2417 // op where the min match coming in is zero, add that ops po
ssible |
| 2418 // starting matches to the possible starts for the overall p
attern. |
| 2419 // |
| 2420 //------------------------------------------------------------------------------ |
| 2421 void RegexCompile::matchStartType() { |
| 2422 if (U_FAILURE(*fStatus)) { |
| 2423 return; |
| 2424 } |
| 2425 |
| 2426 |
| 2427 int32_t loc; // Location in the pattern of the current
op being processed. |
| 2428 int32_t op; // The op being processed |
| 2429 int32_t opType; // The opcode type of the op |
| 2430 int32_t currentLen = 0; // Minimum length of a match to this poin
t (loc) in the pattern |
| 2431 int32_t numInitialStrings = 0; // Number of strings encountered that cou
ld match at start. |
| 2432 |
| 2433 UBool atStart = TRUE; // True if no part of the pattern yet enc
ountered |
| 2434 // could have advanced the position in
a match. |
| 2435 // (Maximum match length so far == 0) |
| 2436 |
| 2437 // forwardedLength is a vector holding minimum-match-length values that |
| 2438 // are propagated forward in the pattern by JMP or STATE_SAVE operations. |
| 2439 // It must be one longer than the pattern being checked because some ops |
| 2440 // will jmp to a end-of-block+1 location from within a block, and we must |
| 2441 // count those when checking the block. |
| 2442 int32_t end = fRXPat->fCompiledPat->size(); |
| 2443 UVector32 forwardedLength(end+1, *fStatus); |
| 2444 forwardedLength.setSize(end+1); |
| 2445 for (loc=3; loc<end; loc++) { |
| 2446 forwardedLength.setElementAt(INT32_MAX, loc); |
| 2447 } |
| 2448 |
| 2449 for (loc = 3; loc<end; loc++) { |
| 2450 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); |
| 2451 opType = URX_TYPE(op); |
| 2452 |
| 2453 // The loop is advancing linearly through the pattern. |
| 2454 // If the op we are now at was the destination of a branch in the patter
n, |
| 2455 // and that path has a shorter minimum length than the current accumulat
ed value, |
| 2456 // replace the current accumulated value. |
| 2457 if (forwardedLength.elementAti(loc) < currentLen) { |
| 2458 currentLen = forwardedLength.elementAti(loc); |
| 2459 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); |
| 2460 } |
| 2461 |
| 2462 switch (opType) { |
| 2463 // Ops that don't change the total length matched |
| 2464 case URX_RESERVED_OP: |
| 2465 case URX_END: |
| 2466 case URX_FAIL: |
| 2467 case URX_STRING_LEN: |
| 2468 case URX_NOP: |
| 2469 case URX_START_CAPTURE: |
| 2470 case URX_END_CAPTURE: |
| 2471 case URX_BACKSLASH_B: |
| 2472 case URX_BACKSLASH_BU: |
| 2473 case URX_BACKSLASH_G: |
| 2474 case URX_BACKSLASH_Z: |
| 2475 case URX_DOLLAR: |
| 2476 case URX_DOLLAR_M: |
| 2477 case URX_DOLLAR_D: |
| 2478 case URX_DOLLAR_MD: |
| 2479 case URX_RELOC_OPRND: |
| 2480 case URX_STO_INP_LOC: |
| 2481 case URX_BACKREF: // BackRef. Must assume that it might be a ze
ro length match |
| 2482 case URX_BACKREF_I: |
| 2483 |
| 2484 case URX_STO_SP: // Setup for atomic or possessive blocks. Doe
sn't change what can match. |
| 2485 case URX_LD_SP: |
| 2486 break; |
| 2487 |
| 2488 case URX_CARET: |
| 2489 if (atStart) { |
| 2490 fRXPat->fStartType = START_START; |
| 2491 } |
| 2492 break; |
| 2493 |
| 2494 case URX_CARET_M: |
| 2495 case URX_CARET_M_UNIX: |
| 2496 if (atStart) { |
| 2497 fRXPat->fStartType = START_LINE; |
| 2498 } |
| 2499 break; |
| 2500 |
| 2501 case URX_ONECHAR: |
| 2502 if (currentLen == 0) { |
| 2503 // This character could appear at the start of a match. |
| 2504 // Add it to the set of possible starting characters. |
| 2505 fRXPat->fInitialChars->add(URX_VAL(op)); |
| 2506 numInitialStrings += 2; |
| 2507 } |
| 2508 currentLen++; |
| 2509 atStart = FALSE; |
| 2510 break; |
| 2511 |
| 2512 |
| 2513 case URX_SETREF: |
| 2514 if (currentLen == 0) { |
| 2515 int32_t sn = URX_VAL(op); |
| 2516 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); |
| 2517 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn)
; |
| 2518 fRXPat->fInitialChars->addAll(*s); |
| 2519 numInitialStrings += 2; |
| 2520 } |
| 2521 currentLen++; |
| 2522 atStart = FALSE; |
| 2523 break; |
| 2524 |
| 2525 case URX_LOOP_SR_I: |
| 2526 // [Set]*, like a SETREF, above, in what it can match, |
| 2527 // but may not match at all, so currentLen is not incremented. |
| 2528 if (currentLen == 0) { |
| 2529 int32_t sn = URX_VAL(op); |
| 2530 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); |
| 2531 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn)
; |
| 2532 fRXPat->fInitialChars->addAll(*s); |
| 2533 numInitialStrings += 2; |
| 2534 } |
| 2535 atStart = FALSE; |
| 2536 break; |
| 2537 |
| 2538 case URX_LOOP_DOT_I: |
| 2539 if (currentLen == 0) { |
| 2540 // .* at the start of a pattern. |
| 2541 // Any character can begin the match. |
| 2542 fRXPat->fInitialChars->clear(); |
| 2543 fRXPat->fInitialChars->complement(); |
| 2544 numInitialStrings += 2; |
| 2545 } |
| 2546 atStart = FALSE; |
| 2547 break; |
| 2548 |
| 2549 |
| 2550 case URX_STATIC_SETREF: |
| 2551 if (currentLen == 0) { |
| 2552 int32_t sn = URX_VAL(op); |
| 2553 U_ASSERT(sn>0 && sn<URX_LAST_SET); |
| 2554 const UnicodeSet *s = fRXPat->fStaticSets[sn]; |
| 2555 fRXPat->fInitialChars->addAll(*s); |
| 2556 numInitialStrings += 2; |
| 2557 } |
| 2558 currentLen++; |
| 2559 atStart = FALSE; |
| 2560 break; |
| 2561 |
| 2562 |
| 2563 |
| 2564 case URX_STAT_SETREF_N: |
| 2565 if (currentLen == 0) { |
| 2566 int32_t sn = URX_VAL(op); |
| 2567 const UnicodeSet *s = fRXPat->fStaticSets[sn]; |
| 2568 UnicodeSet sc(*s); |
| 2569 sc.complement(); |
| 2570 fRXPat->fInitialChars->addAll(sc); |
| 2571 numInitialStrings += 2; |
| 2572 } |
| 2573 currentLen++; |
| 2574 atStart = FALSE; |
| 2575 break; |
| 2576 |
| 2577 |
| 2578 |
| 2579 case URX_BACKSLASH_D: |
| 2580 // Digit Char |
| 2581 if (currentLen == 0) { |
| 2582 UnicodeSet s; |
| 2583 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MA
SK, *fStatus); |
| 2584 if (URX_VAL(op) != 0) { |
| 2585 s.complement(); |
| 2586 } |
| 2587 fRXPat->fInitialChars->addAll(s); |
| 2588 numInitialStrings += 2; |
| 2589 } |
| 2590 currentLen++; |
| 2591 atStart = FALSE; |
| 2592 break; |
| 2593 |
| 2594 |
| 2595 case URX_ONECHAR_I: |
| 2596 // Case Insensitive Single Character. |
| 2597 if (currentLen == 0) { |
| 2598 UChar32 c = URX_VAL(op); |
| 2599 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { |
| 2600 // character may have distinct cased forms. Add all of them |
| 2601 // to the set of possible starting match chars. |
| 2602 UnicodeSet s(c, c); |
| 2603 s.closeOver(USET_CASE_INSENSITIVE); |
| 2604 fRXPat->fInitialChars->addAll(s); |
| 2605 } else { |
| 2606 // Char has no case variants. Just add it as-is to the |
| 2607 // set of possible starting chars. |
| 2608 fRXPat->fInitialChars->add(c); |
| 2609 } |
| 2610 numInitialStrings += 2; |
| 2611 } |
| 2612 currentLen++; |
| 2613 atStart = FALSE; |
| 2614 break; |
| 2615 |
| 2616 |
| 2617 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounde
d. |
| 2618 case URX_DOTANY_ALL: // . matches one or two. |
| 2619 case URX_DOTANY: |
| 2620 case URX_DOTANY_UNIX: |
| 2621 if (currentLen == 0) { |
| 2622 // These constructs are all bad news when they appear at the sta
rt |
| 2623 // of a match. Any character can begin the match. |
| 2624 fRXPat->fInitialChars->clear(); |
| 2625 fRXPat->fInitialChars->complement(); |
| 2626 numInitialStrings += 2; |
| 2627 } |
| 2628 currentLen++; |
| 2629 atStart = FALSE; |
| 2630 break; |
| 2631 |
| 2632 |
| 2633 case URX_JMPX: |
| 2634 loc++; // Except for extra operand on URX_JMPX, same as
URX_JMP. |
| 2635 case URX_JMP: |
| 2636 { |
| 2637 int32_t jmpDest = URX_VAL(op); |
| 2638 if (jmpDest < loc) { |
| 2639 // Loop of some kind. Can safely ignore, the worst that wil
l happen |
| 2640 // is that we understate the true minimum length |
| 2641 currentLen = forwardedLength.elementAti(loc+1); |
| 2642 |
| 2643 } else { |
| 2644 // Forward jump. Propagate the current min length to the ta
rget loc of the jump. |
| 2645 U_ASSERT(jmpDest <= end+1); |
| 2646 if (forwardedLength.elementAti(jmpDest) > currentLen) { |
| 2647 forwardedLength.setElementAt(currentLen, jmpDest); |
| 2648 } |
| 2649 } |
| 2650 } |
| 2651 atStart = FALSE; |
| 2652 break; |
| 2653 |
| 2654 case URX_JMP_SAV: |
| 2655 case URX_JMP_SAV_X: |
| 2656 // Combo of state save to the next loc, + jmp backwards. |
| 2657 // Net effect on min. length computation is nothing. |
| 2658 atStart = FALSE; |
| 2659 break; |
| 2660 |
| 2661 case URX_BACKTRACK: |
| 2662 // Fails are kind of like a branch, except that the min length was |
| 2663 // propagated already, by the state save. |
| 2664 currentLen = forwardedLength.elementAti(loc+1); |
| 2665 atStart = FALSE; |
| 2666 break; |
| 2667 |
| 2668 |
| 2669 case URX_STATE_SAVE: |
| 2670 { |
| 2671 // State Save, for forward jumps, propagate the current minimum. |
| 2672 // of the state save. |
| 2673 int32_t jmpDest = URX_VAL(op); |
| 2674 if (jmpDest > loc) { |
| 2675 if (currentLen < forwardedLength.elementAti(jmpDest)) { |
| 2676 forwardedLength.setElementAt(currentLen, jmpDest); |
| 2677 } |
| 2678 } |
| 2679 } |
| 2680 atStart = FALSE; |
| 2681 break; |
| 2682 |
| 2683 |
| 2684 |
| 2685 |
| 2686 case URX_STRING: |
| 2687 { |
| 2688 loc++; |
| 2689 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(
loc); |
| 2690 int32_t stringLen = URX_VAL(stringLenOp); |
| 2691 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); |
| 2692 U_ASSERT(stringLenOp >= 2); |
| 2693 if (currentLen == 0) { |
| 2694 // Add the starting character of this string to the set of p
ossible starting |
| 2695 // characters for this pattern. |
| 2696 int32_t stringStartIdx = URX_VAL(op); |
| 2697 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); |
| 2698 fRXPat->fInitialChars->add(c); |
| 2699 |
| 2700 // Remember this string. After the entire pattern has been
checked, |
| 2701 // if nothing else is identified that can start a match, we
'll use it. |
| 2702 numInitialStrings++; |
| 2703 fRXPat->fInitialStringIdx = stringStartIdx; |
| 2704 fRXPat->fInitialStringLen = stringLen; |
| 2705 } |
| 2706 |
| 2707 currentLen += stringLen; |
| 2708 atStart = FALSE; |
| 2709 } |
| 2710 break; |
| 2711 |
| 2712 case URX_STRING_I: |
| 2713 { |
| 2714 // Case-insensitive string. Unlike exact-match strings, we won'
t |
| 2715 // attempt a string search for possible match positions. But
we |
| 2716 // do update the set of possible starting characters. |
| 2717 loc++; |
| 2718 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(
loc); |
| 2719 int32_t stringLen = URX_VAL(stringLenOp); |
| 2720 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); |
| 2721 U_ASSERT(stringLenOp >= 2); |
| 2722 if (currentLen == 0) { |
| 2723 // Add the starting character of this string to the set of p
ossible starting |
| 2724 // characters for this pattern. |
| 2725 int32_t stringStartIdx = URX_VAL(op); |
| 2726 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); |
| 2727 UnicodeSet s(c, c); |
| 2728 s.closeOver(USET_CASE_INSENSITIVE); |
| 2729 fRXPat->fInitialChars->addAll(s); |
| 2730 numInitialStrings += 2; // Matching on an initial string no
t possible. |
| 2731 } |
| 2732 currentLen += stringLen; |
| 2733 atStart = FALSE; |
| 2734 } |
| 2735 break; |
| 2736 |
| 2737 case URX_CTR_INIT: |
| 2738 case URX_CTR_INIT_NG: |
| 2739 { |
| 2740 // Loop Init Ops. These don't change the min length, but they a
re 4 word ops |
| 2741 // so location must be updated accordingly. |
| 2742 // Loop Init Ops. |
| 2743 // If the min loop count == 0 |
| 2744 // move loc forwards to the end of the loop, skipping over
the body. |
| 2745 // If the min count is > 0, |
| 2746 // continue normal processing of the body of the loop. |
| 2747 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti
(loc+1); |
| 2748 loopEndLoc = URX_VAL(loopEndLoc); |
| 2749 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti
(loc+2); |
| 2750 if (minLoopCount == 0) { |
| 2751 // Min Loop Count of 0, treat like a forward branch and |
| 2752 // move the current minimum length up to the target |
| 2753 // (end of loop) location. |
| 2754 U_ASSERT(loopEndLoc <= end+1); |
| 2755 if (forwardedLength.elementAti(loopEndLoc) > currentLen) { |
| 2756 forwardedLength.setElementAt(currentLen, loopEndLoc); |
| 2757 } |
| 2758 } |
| 2759 loc+=3; // Skips over operands of CTR_INIT |
| 2760 } |
| 2761 atStart = FALSE; |
| 2762 break; |
| 2763 |
| 2764 |
| 2765 case URX_CTR_LOOP: |
| 2766 case URX_CTR_LOOP_NG: |
| 2767 // Loop ops. |
| 2768 // The jump is conditional, backwards only. |
| 2769 atStart = FALSE; |
| 2770 break; |
| 2771 |
| 2772 case URX_LOOP_C: |
| 2773 // More loop ops. These state-save to themselves. |
| 2774 // don't change the minimum match |
| 2775 atStart = FALSE; |
| 2776 break; |
| 2777 |
| 2778 |
| 2779 case URX_LA_START: |
| 2780 case URX_LB_START: |
| 2781 { |
| 2782 // Look-around. Scan forward until the matching look-ahead end, |
| 2783 // without processing the look-around block. This is overly p
essimistic. |
| 2784 |
| 2785 // Keep track of the nesting depth of look-around blocks. Boile
rplate code for |
| 2786 // lookahead contains two LA_END instructions, so count goes u
p by two |
| 2787 // for each LA_START. |
| 2788 int32_t depth = (opType == URX_LA_START? 2: 1); |
| 2789 for (;;) { |
| 2790 loc++; |
| 2791 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); |
| 2792 if (URX_TYPE(op) == URX_LA_START) { |
| 2793 depth+=2; |
| 2794 } |
| 2795 if (URX_TYPE(op) == URX_LB_START) { |
| 2796 depth++; |
| 2797 } |
| 2798 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END)
{ |
| 2799 depth--; |
| 2800 if (depth == 0) { |
| 2801 break; |
| 2802 } |
| 2803 } |
| 2804 if (URX_TYPE(op) == URX_STATE_SAVE) { |
| 2805 // Need this because neg lookahead blocks will FAIL to o
utside |
| 2806 // of the block. |
| 2807 int32_t jmpDest = URX_VAL(op); |
| 2808 if (jmpDest > loc) { |
| 2809 if (currentLen < forwardedLength.elementAti(jmpDest)
) { |
| 2810 forwardedLength.setElementAt(currentLen, jmpDest
); |
| 2811 } |
| 2812 } |
| 2813 } |
| 2814 U_ASSERT(loc <= end); |
| 2815 } |
| 2816 } |
| 2817 break; |
| 2818 |
| 2819 case URX_LA_END: |
| 2820 case URX_LB_CONT: |
| 2821 case URX_LB_END: |
| 2822 case URX_LBN_CONT: |
| 2823 case URX_LBN_END: |
| 2824 U_ASSERT(FALSE); // Shouldn't get here. These ops should be |
| 2825 // consumed by the scan in URX_LA_START and LB
_START |
| 2826 |
| 2827 break; |
| 2828 |
| 2829 default: |
| 2830 U_ASSERT(FALSE); |
| 2831 } |
| 2832 |
| 2833 } |
| 2834 |
| 2835 |
| 2836 // We have finished walking through the ops. Check whether some forward jum
p |
| 2837 // propagated a shorter length to location end+1. |
| 2838 if (forwardedLength.elementAti(end+1) < currentLen) { |
| 2839 currentLen = forwardedLength.elementAti(end+1); |
| 2840 } |
| 2841 |
| 2842 |
| 2843 fRXPat->fInitialChars8->init(fRXPat->fInitialChars); |
| 2844 |
| 2845 |
| 2846 // Sort out what we should check for when looking for candidate match start
positions. |
| 2847 // In order of preference, |
| 2848 // 1. Start of input text buffer. |
| 2849 // 2. A literal string. |
| 2850 // 3. Start of line in multi-line mode. |
| 2851 // 4. A single literal character. |
| 2852 // 5. A character from a set of characters. |
| 2853 // |
| 2854 if (fRXPat->fStartType == START_START) { |
| 2855 // Match only at the start of an input text string. |
| 2856 // start type is already set. We're done. |
| 2857 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) { |
| 2858 // Match beginning only with a literal string. |
| 2859 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx); |
| 2860 U_ASSERT(fRXPat->fInitialChars->contains(c)); |
| 2861 fRXPat->fStartType = START_STRING; |
| 2862 fRXPat->fInitialChar = c; |
| 2863 } else if (fRXPat->fStartType == START_LINE) { |
| 2864 // Match at start of line in Multi-Line mode. |
| 2865 // Nothing to do here; everything is already set. |
| 2866 } else if (fRXPat->fMinMatchLen == 0) { |
| 2867 // Zero length match possible. We could start anywhere. |
| 2868 fRXPat->fStartType = START_NO_INFO; |
| 2869 } else if (fRXPat->fInitialChars->size() == 1) { |
| 2870 // All matches begin with the same char. |
| 2871 fRXPat->fStartType = START_CHAR; |
| 2872 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0); |
| 2873 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); |
| 2874 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) ==
FALSE && |
| 2875 fRXPat->fMinMatchLen > 0) { |
| 2876 // Matches start with a set of character smaller than the set of all cha
rs. |
| 2877 fRXPat->fStartType = START_SET; |
| 2878 } else { |
| 2879 // Matches can start with anything |
| 2880 fRXPat->fStartType = START_NO_INFO; |
| 2881 } |
| 2882 |
| 2883 return; |
| 2884 } |
| 2885 |
| 2886 |
| 2887 |
| 2888 //------------------------------------------------------------------------------ |
| 2889 // |
| 2890 // minMatchLength Calculate the length of the shortest string that could |
| 2891 // match the specified pattern. |
| 2892 // Length is in 16 bit code units, not code points. |
| 2893 // |
| 2894 // The calculated length may not be exact. The returned |
| 2895 // value may be shorter than the actual minimum; it must |
| 2896 // never be longer. |
| 2897 // |
| 2898 // start and end are the range of p-code operations to be |
| 2899 // examined. The endpoints are included in the range. |
| 2900 // |
| 2901 //------------------------------------------------------------------------------ |
| 2902 int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { |
| 2903 if (U_FAILURE(*fStatus)) { |
| 2904 return 0; |
| 2905 } |
| 2906 |
| 2907 U_ASSERT(start <= end); |
| 2908 U_ASSERT(end < fRXPat->fCompiledPat->size()); |
| 2909 |
| 2910 |
| 2911 int32_t loc; |
| 2912 int32_t op; |
| 2913 int32_t opType; |
| 2914 int32_t currentLen = 0; |
| 2915 |
| 2916 |
| 2917 // forwardedLength is a vector holding minimum-match-length values that |
| 2918 // are propagated forward in the pattern by JMP or STATE_SAVE operations. |
| 2919 // It must be one longer than the pattern being checked because some ops |
| 2920 // will jmp to a end-of-block+1 location from within a block, and we must |
| 2921 // count those when checking the block. |
| 2922 UVector32 forwardedLength(end+2, *fStatus); |
| 2923 forwardedLength.setSize(end+2); |
| 2924 for (loc=start; loc<=end+1; loc++) { |
| 2925 forwardedLength.setElementAt(INT32_MAX, loc); |
| 2926 } |
| 2927 |
| 2928 for (loc = start; loc<=end; loc++) { |
| 2929 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); |
| 2930 opType = URX_TYPE(op); |
| 2931 |
| 2932 // The loop is advancing linearly through the pattern. |
| 2933 // If the op we are now at was the destination of a branch in the patter
n, |
| 2934 // and that path has a shorter minimum length than the current accumulat
ed value, |
| 2935 // replace the current accumulated value. |
| 2936 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == I
NT32_MAX for some |
| 2937 // no-match-pos
sible cases. |
| 2938 if (forwardedLength.elementAti(loc) < currentLen) { |
| 2939 currentLen = forwardedLength.elementAti(loc); |
| 2940 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); |
| 2941 } |
| 2942 |
| 2943 switch (opType) { |
| 2944 // Ops that don't change the total length matched |
| 2945 case URX_RESERVED_OP: |
| 2946 case URX_END: |
| 2947 case URX_STRING_LEN: |
| 2948 case URX_NOP: |
| 2949 case URX_START_CAPTURE: |
| 2950 case URX_END_CAPTURE: |
| 2951 case URX_BACKSLASH_B: |
| 2952 case URX_BACKSLASH_BU: |
| 2953 case URX_BACKSLASH_G: |
| 2954 case URX_BACKSLASH_Z: |
| 2955 case URX_CARET: |
| 2956 case URX_DOLLAR: |
| 2957 case URX_DOLLAR_M: |
| 2958 case URX_DOLLAR_D: |
| 2959 case URX_DOLLAR_MD: |
| 2960 case URX_RELOC_OPRND: |
| 2961 case URX_STO_INP_LOC: |
| 2962 case URX_CARET_M: |
| 2963 case URX_CARET_M_UNIX: |
| 2964 case URX_BACKREF: // BackRef. Must assume that it might be a ze
ro length match |
| 2965 case URX_BACKREF_I: |
| 2966 |
| 2967 case URX_STO_SP: // Setup for atomic or possessive blocks. Doe
sn't change what can match. |
| 2968 case URX_LD_SP: |
| 2969 |
| 2970 case URX_JMP_SAV: |
| 2971 case URX_JMP_SAV_X: |
| 2972 break; |
| 2973 |
| 2974 |
| 2975 // Ops that match a minimum of one character (one or two 16 bit code
units.) |
| 2976 // |
| 2977 case URX_ONECHAR: |
| 2978 case URX_STATIC_SETREF: |
| 2979 case URX_STAT_SETREF_N: |
| 2980 case URX_SETREF: |
| 2981 case URX_BACKSLASH_D: |
| 2982 case URX_ONECHAR_I: |
| 2983 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounde
d. |
| 2984 case URX_DOTANY_ALL: // . matches one or two. |
| 2985 case URX_DOTANY: |
| 2986 case URX_DOTANY_UNIX: |
| 2987 currentLen++; |
| 2988 break; |
| 2989 |
| 2990 |
| 2991 case URX_JMPX: |
| 2992 loc++; // URX_JMPX has an extra operand, ignored here, |
| 2993 // otherwise processed identically to URX_JMP. |
| 2994 case URX_JMP: |
| 2995 { |
| 2996 int32_t jmpDest = URX_VAL(op); |
| 2997 if (jmpDest < loc) { |
| 2998 // Loop of some kind. Can safely ignore, the worst that wil
l happen |
| 2999 // is that we understate the true minimum length |
| 3000 currentLen = forwardedLength.elementAti(loc+1); |
| 3001 } else { |
| 3002 // Forward jump. Propagate the current min length to the ta
rget loc of the jump. |
| 3003 U_ASSERT(jmpDest <= end+1); |
| 3004 if (forwardedLength.elementAti(jmpDest) > currentLen) { |
| 3005 forwardedLength.setElementAt(currentLen, jmpDest); |
| 3006 } |
| 3007 } |
| 3008 } |
| 3009 break; |
| 3010 |
| 3011 case URX_BACKTRACK: |
| 3012 { |
| 3013 // Back-tracks are kind of like a branch, except that the min le
ngth was |
| 3014 // propagated already, by the state save. |
| 3015 currentLen = forwardedLength.elementAti(loc+1); |
| 3016 } |
| 3017 break; |
| 3018 |
| 3019 |
| 3020 case URX_STATE_SAVE: |
| 3021 { |
| 3022 // State Save, for forward jumps, propagate the current minimum. |
| 3023 // of the state save. |
| 3024 int32_t jmpDest = URX_VAL(op); |
| 3025 if (jmpDest > loc) { |
| 3026 if (currentLen < forwardedLength.elementAti(jmpDest)) { |
| 3027 forwardedLength.setElementAt(currentLen, jmpDest); |
| 3028 } |
| 3029 } |
| 3030 } |
| 3031 break; |
| 3032 |
| 3033 |
| 3034 case URX_STRING: |
| 3035 case URX_STRING_I: |
| 3036 { |
| 3037 loc++; |
| 3038 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(
loc); |
| 3039 currentLen += URX_VAL(stringLenOp); |
| 3040 } |
| 3041 break; |
| 3042 |
| 3043 |
| 3044 case URX_CTR_INIT: |
| 3045 case URX_CTR_INIT_NG: |
| 3046 { |
| 3047 // Loop Init Ops. |
| 3048 // If the min loop count == 0 |
| 3049 // move loc forwards to the end of the loop, skipping over
the body. |
| 3050 // If the min count is > 0, |
| 3051 // continue normal processing of the body of the loop. |
| 3052 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti
(loc+1); |
| 3053 loopEndLoc = URX_VAL(loopEndLoc); |
| 3054 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti
(loc+2); |
| 3055 if (minLoopCount == 0) { |
| 3056 loc = loopEndLoc; |
| 3057 } else { |
| 3058 loc+=3; // Skips over operands of CTR_INIT |
| 3059 } |
| 3060 } |
| 3061 break; |
| 3062 |
| 3063 |
| 3064 case URX_CTR_LOOP: |
| 3065 case URX_CTR_LOOP_NG: |
| 3066 // Loop ops. |
| 3067 // The jump is conditional, backwards only. |
| 3068 break; |
| 3069 |
| 3070 case URX_LOOP_SR_I: |
| 3071 case URX_LOOP_DOT_I: |
| 3072 case URX_LOOP_C: |
| 3073 // More loop ops. These state-save to themselves. |
| 3074 // don't change the minimum match - could match nothing at all. |
| 3075 break; |
| 3076 |
| 3077 |
| 3078 case URX_LA_START: |
| 3079 case URX_LB_START: |
| 3080 { |
| 3081 // Look-around. Scan forward until the matching look-ahead end, |
| 3082 // without processing the look-around block. This is overly p
essimistic for look-ahead, |
| 3083 // it assumes that the look-ahead match might be zero-length. |
| 3084 // TODO: Positive lookahead could recursively do the block, t
hen continue |
| 3085 // with the longer of the block or the value coming in.
Ticket 6060 |
| 3086 int32_t depth = (opType == URX_LA_START? 2: 1);; |
| 3087 for (;;) { |
| 3088 loc++; |
| 3089 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); |
| 3090 if (URX_TYPE(op) == URX_LA_START) { |
| 3091 // The boilerplate for look-ahead includes two LA_END in
sturctions, |
| 3092 // Depth will be decremented by each one when it is s
een. |
| 3093 depth += 2; |
| 3094 } |
| 3095 if (URX_TYPE(op) == URX_LB_START) { |
| 3096 depth++; |
| 3097 } |
| 3098 if (URX_TYPE(op) == URX_LA_END) { |
| 3099 depth--; |
| 3100 if (depth == 0) { |
| 3101 break; |
| 3102 } |
| 3103 } |
| 3104 if (URX_TYPE(op)==URX_LBN_END) { |
| 3105 depth--; |
| 3106 if (depth == 0) { |
| 3107 break; |
| 3108 } |
| 3109 } |
| 3110 if (URX_TYPE(op) == URX_STATE_SAVE) { |
| 3111 // Need this because neg lookahead blocks will FAIL to o
utside |
| 3112 // of the block. |
| 3113 int32_t jmpDest = URX_VAL(op); |
| 3114 if (jmpDest > loc) { |
| 3115 if (currentLen < forwardedLength.elementAti(jmpDest)
) { |
| 3116 forwardedLength.setElementAt(currentLen, jmpDest
); |
| 3117 } |
| 3118 } |
| 3119 } |
| 3120 U_ASSERT(loc <= end); |
| 3121 } |
| 3122 } |
| 3123 break; |
| 3124 |
| 3125 case URX_LA_END: |
| 3126 case URX_LB_CONT: |
| 3127 case URX_LB_END: |
| 3128 case URX_LBN_CONT: |
| 3129 case URX_LBN_END: |
| 3130 // Only come here if the matching URX_LA_START or URX_LB_START was n
ot in the |
| 3131 // range being sized, which happens when measuring size of look-be
hind blocks. |
| 3132 break; |
| 3133 |
| 3134 default: |
| 3135 U_ASSERT(FALSE); |
| 3136 } |
| 3137 |
| 3138 } |
| 3139 |
| 3140 // We have finished walking through the ops. Check whether some forward jum
p |
| 3141 // propagated a shorter length to location end+1. |
| 3142 if (forwardedLength.elementAti(end+1) < currentLen) { |
| 3143 currentLen = forwardedLength.elementAti(end+1); |
| 3144 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); |
| 3145 } |
| 3146 |
| 3147 return currentLen; |
| 3148 } |
| 3149 |
| 3150 |
| 3151 |
| 3152 //------------------------------------------------------------------------------ |
| 3153 // |
| 3154 // maxMatchLength Calculate the length of the longest string that could |
| 3155 // match the specified pattern. |
| 3156 // Length is in 16 bit code units, not code points. |
| 3157 // |
| 3158 // The calculated length may not be exact. The returned |
| 3159 // value may be longer than the actual maximum; it must |
| 3160 // never be shorter. |
| 3161 // |
| 3162 //------------------------------------------------------------------------------ |
| 3163 int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { |
| 3164 if (U_FAILURE(*fStatus)) { |
| 3165 return 0; |
| 3166 } |
| 3167 U_ASSERT(start <= end); |
| 3168 U_ASSERT(end < fRXPat->fCompiledPat->size()); |
| 3169 |
| 3170 |
| 3171 int32_t loc; |
| 3172 int32_t op; |
| 3173 int32_t opType; |
| 3174 int32_t currentLen = 0; |
| 3175 UVector32 forwardedLength(end+1, *fStatus); |
| 3176 forwardedLength.setSize(end+1); |
| 3177 |
| 3178 for (loc=start; loc<=end; loc++) { |
| 3179 forwardedLength.setElementAt(0, loc); |
| 3180 } |
| 3181 |
| 3182 for (loc = start; loc<=end; loc++) { |
| 3183 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); |
| 3184 opType = URX_TYPE(op); |
| 3185 |
| 3186 // The loop is advancing linearly through the pattern. |
| 3187 // If the op we are now at was the destination of a branch in the patter
n, |
| 3188 // and that path has a longer maximum length than the current accumulate
d value, |
| 3189 // replace the current accumulated value. |
| 3190 if (forwardedLength.elementAti(loc) > currentLen) { |
| 3191 currentLen = forwardedLength.elementAti(loc); |
| 3192 } |
| 3193 |
| 3194 switch (opType) { |
| 3195 // Ops that don't change the total length matched |
| 3196 case URX_RESERVED_OP: |
| 3197 case URX_END: |
| 3198 case URX_STRING_LEN: |
| 3199 case URX_NOP: |
| 3200 case URX_START_CAPTURE: |
| 3201 case URX_END_CAPTURE: |
| 3202 case URX_BACKSLASH_B: |
| 3203 case URX_BACKSLASH_BU: |
| 3204 case URX_BACKSLASH_G: |
| 3205 case URX_BACKSLASH_Z: |
| 3206 case URX_CARET: |
| 3207 case URX_DOLLAR: |
| 3208 case URX_DOLLAR_M: |
| 3209 case URX_DOLLAR_D: |
| 3210 case URX_DOLLAR_MD: |
| 3211 case URX_RELOC_OPRND: |
| 3212 case URX_STO_INP_LOC: |
| 3213 case URX_CARET_M: |
| 3214 case URX_CARET_M_UNIX: |
| 3215 |
| 3216 case URX_STO_SP: // Setup for atomic or possessive blocks. Doe
sn't change what can match. |
| 3217 case URX_LD_SP: |
| 3218 |
| 3219 case URX_LB_END: |
| 3220 case URX_LB_CONT: |
| 3221 case URX_LBN_CONT: |
| 3222 case URX_LBN_END: |
| 3223 break; |
| 3224 |
| 3225 |
| 3226 // Ops that increase that cause an unbounded increase in the length |
| 3227 // of a matched string, or that increase it a hard to characterize
way. |
| 3228 // Call the max length unbounded, and stop further checking. |
| 3229 case URX_BACKREF: // BackRef. Must assume that it might be a ze
ro length match |
| 3230 case URX_BACKREF_I: |
| 3231 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounde
d. |
| 3232 currentLen = INT32_MAX; |
| 3233 break; |
| 3234 |
| 3235 |
| 3236 // Ops that match a max of one character (possibly two 16 bit code u
nits.) |
| 3237 // |
| 3238 case URX_STATIC_SETREF: |
| 3239 case URX_STAT_SETREF_N: |
| 3240 case URX_SETREF: |
| 3241 case URX_BACKSLASH_D: |
| 3242 case URX_ONECHAR_I: |
| 3243 case URX_DOTANY_ALL: |
| 3244 case URX_DOTANY: |
| 3245 case URX_DOTANY_UNIX: |
| 3246 currentLen+=2; |
| 3247 break; |
| 3248 |
| 3249 // Single literal character. Increase current max length by one or
two, |
| 3250 // depending on whether the char is in the supplementary range
. |
| 3251 case URX_ONECHAR: |
| 3252 currentLen++; |
| 3253 if (URX_VAL(op) > 0x10000) { |
| 3254 currentLen++; |
| 3255 } |
| 3256 break; |
| 3257 |
| 3258 // Jumps. |
| 3259 // |
| 3260 case URX_JMP: |
| 3261 case URX_JMPX: |
| 3262 case URX_JMP_SAV: |
| 3263 case URX_JMP_SAV_X: |
| 3264 { |
| 3265 int32_t jmpDest = URX_VAL(op); |
| 3266 if (jmpDest < loc) { |
| 3267 // Loop of some kind. Max match length is unbounded. |
| 3268 currentLen = INT32_MAX; |
| 3269 } else { |
| 3270 // Forward jump. Propagate the current min length to the ta
rget loc of the jump. |
| 3271 if (forwardedLength.elementAti(jmpDest) < currentLen) { |
| 3272 forwardedLength.setElementAt(currentLen, jmpDest); |
| 3273 } |
| 3274 currentLen = 0; |
| 3275 } |
| 3276 } |
| 3277 break; |
| 3278 |
| 3279 case URX_BACKTRACK: |
| 3280 // back-tracks are kind of like a branch, except that the max length
was |
| 3281 // propagated already, by the state save. |
| 3282 currentLen = forwardedLength.elementAti(loc+1); |
| 3283 break; |
| 3284 |
| 3285 |
| 3286 case URX_STATE_SAVE: |
| 3287 { |
| 3288 // State Save, for forward jumps, propagate the current minimum. |
| 3289 // of the state save. |
| 3290 // For backwards jumps, they create a loop, maximum |
| 3291 // match length is unbounded. |
| 3292 int32_t jmpDest = URX_VAL(op); |
| 3293 if (jmpDest > loc) { |
| 3294 if (currentLen > forwardedLength.elementAti(jmpDest)) { |
| 3295 forwardedLength.setElementAt(currentLen, jmpDest); |
| 3296 } |
| 3297 } else { |
| 3298 currentLen = INT32_MAX; |
| 3299 } |
| 3300 } |
| 3301 break; |
| 3302 |
| 3303 |
| 3304 |
| 3305 |
| 3306 case URX_STRING: |
| 3307 case URX_STRING_I: |
| 3308 { |
| 3309 loc++; |
| 3310 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(
loc); |
| 3311 currentLen += URX_VAL(stringLenOp); |
| 3312 } |
| 3313 break; |
| 3314 |
| 3315 |
| 3316 case URX_CTR_INIT: |
| 3317 case URX_CTR_INIT_NG: |
| 3318 case URX_CTR_LOOP: |
| 3319 case URX_CTR_LOOP_NG: |
| 3320 case URX_LOOP_SR_I: |
| 3321 case URX_LOOP_DOT_I: |
| 3322 case URX_LOOP_C: |
| 3323 // For anything to do with loops, make the match length unbounded. |
| 3324 // Note: INIT instructions are multi-word. Can ignore because |
| 3325 // INT32_MAX length will stop the per-instruction loop. |
| 3326 currentLen = INT32_MAX; |
| 3327 break; |
| 3328 |
| 3329 |
| 3330 |
| 3331 case URX_LA_START: |
| 3332 case URX_LA_END: |
| 3333 // Look-ahead. Just ignore, treat the look-ahead block as if |
| 3334 // it were normal pattern. Gives a too-long match length, |
| 3335 // but good enough for now. |
| 3336 break; |
| 3337 |
| 3338 // End of look-ahead ops should always be consumed by the processing
at |
| 3339 // the URX_LA_START op. |
| 3340 // U_ASSERT(FALSE); |
| 3341 // break; |
| 3342 |
| 3343 case URX_LB_START: |
| 3344 { |
| 3345 // Look-behind. Scan forward until the matching look-around end
, |
| 3346 // without processing the look-behind block. |
| 3347 int32_t depth = 0; |
| 3348 for (;;) { |
| 3349 loc++; |
| 3350 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); |
| 3351 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_S
TART) { |
| 3352 depth++; |
| 3353 } |
| 3354 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END)
{ |
| 3355 if (depth == 0) { |
| 3356 break; |
| 3357 } |
| 3358 depth--; |
| 3359 } |
| 3360 U_ASSERT(loc < end); |
| 3361 } |
| 3362 } |
| 3363 break; |
| 3364 |
| 3365 default: |
| 3366 U_ASSERT(FALSE); |
| 3367 } |
| 3368 |
| 3369 |
| 3370 if (currentLen == INT32_MAX) { |
| 3371 // The maximum length is unbounded. |
| 3372 // Stop further processing of the pattern. |
| 3373 break; |
| 3374 } |
| 3375 |
| 3376 } |
| 3377 return currentLen; |
| 3378 |
| 3379 } |
| 3380 |
| 3381 |
| 3382 //------------------------------------------------------------------------------ |
| 3383 // |
| 3384 // stripNOPs Remove any NOP operations from the compiled pattern code. |
| 3385 // Extra NOPs are inserted for some constructs during the initial |
| 3386 // code generation to provide locations that may be patched later
. |
| 3387 // Many end up unneeded, and are removed by this function. |
| 3388 // |
| 3389 // In order to minimize the number of passes through the pattern, |
| 3390 // back-reference fixup is also performed here (adjusting |
| 3391 // back-reference operands to point to the correct frame offsets)
. |
| 3392 // |
| 3393 // In addition, case-insensitive character and string literals ar
e |
| 3394 // now case-folded here, rather than when first parsed or at matc
h |
| 3395 // time. |
| 3396 // |
| 3397 //------------------------------------------------------------------------------ |
| 3398 void RegexCompile::stripNOPs() { |
| 3399 |
| 3400 if (U_FAILURE(*fStatus)) { |
| 3401 return; |
| 3402 } |
| 3403 |
| 3404 int32_t end = fRXPat->fCompiledPat->size(); |
| 3405 UVector32 deltas(end, *fStatus); |
| 3406 |
| 3407 // Make a first pass over the code, computing the amount that things |
| 3408 // will be offset at each location in the original code. |
| 3409 int32_t loc; |
| 3410 int32_t d = 0; |
| 3411 for (loc=0; loc<end; loc++) { |
| 3412 deltas.addElement(d, *fStatus); |
| 3413 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); |
| 3414 if (URX_TYPE(op) == URX_NOP) { |
| 3415 d++; |
| 3416 } |
| 3417 } |
| 3418 |
| 3419 UnicodeString caseStringBuffer; |
| 3420 int32_t stringDelta = 0; |
| 3421 |
| 3422 // Make a second pass over the code, removing the NOPs by moving following |
| 3423 // code up, and patching operands that refer to code locations that |
| 3424 // are being moved. The array of offsets from the first step is used |
| 3425 // to compute the new operand values. |
| 3426 int32_t src; |
| 3427 int32_t dst = 0; |
| 3428 for (src=0; src<end; src++) { |
| 3429 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); |
| 3430 int32_t opType = URX_TYPE(op); |
| 3431 switch (opType) { |
| 3432 case URX_NOP: |
| 3433 break; |
| 3434 |
| 3435 case URX_STATE_SAVE: |
| 3436 case URX_JMP: |
| 3437 case URX_CTR_LOOP: |
| 3438 case URX_CTR_LOOP_NG: |
| 3439 case URX_RELOC_OPRND: |
| 3440 case URX_JMPX: |
| 3441 case URX_JMP_SAV: |
| 3442 case URX_JMP_SAV_X: |
| 3443 // These are instructions with operands that refer to code locations
. |
| 3444 { |
| 3445 int32_t operandAddress = URX_VAL(op); |
| 3446 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); |
| 3447 int32_t fixedOperandAddress = operandAddress - deltas.elementAti
(operandAddress); |
| 3448 op = URX_BUILD(opType, fixedOperandAddress); |
| 3449 fRXPat->fCompiledPat->setElementAt(op, dst); |
| 3450 dst++; |
| 3451 break; |
| 3452 } |
| 3453 |
| 3454 case URX_ONECHAR_I: |
| 3455 { |
| 3456 UChar32 c = URX_VAL(op); |
| 3457 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { |
| 3458 // We have a cased character to fold |
| 3459 c = u_foldCase(c, U_FOLD_CASE_DEFAULT); |
| 3460 op = URX_BUILD(URX_ONECHAR_I, c); |
| 3461 } |
| 3462 |
| 3463 fRXPat->fCompiledPat->setElementAt(op, dst); |
| 3464 dst++; |
| 3465 break; |
| 3466 } |
| 3467 case URX_STRING_I: |
| 3468 { |
| 3469 op = URX_BUILD(URX_STRING_I, URX_VAL(op)+stringDelta); |
| 3470 |
| 3471 src++; |
| 3472 int32_t lengthOp = (int32_t)fRXPat->fCompiledPat->elementAti(src
); |
| 3473 |
| 3474 caseStringBuffer.setTo(fRXPat->fLiteralText, URX_VAL(op), URX_VA
L(lengthOp)); |
| 3475 caseStringBuffer.foldCase(U_FOLD_CASE_DEFAULT); |
| 3476 |
| 3477 int32_t newLen = caseStringBuffer.length(); |
| 3478 if (newLen <= URX_VAL(lengthOp)) { |
| 3479 // don't shift if we don't have to, take the tiny memory hit
of a smaller string |
| 3480 fRXPat->fLiteralText.replace(URX_VAL(op), newLen, caseString
Buffer); |
| 3481 } else { |
| 3482 // shift other strings over...at least UnicodeString handles
this for us! |
| 3483 fRXPat->fLiteralText.replace(URX_VAL(op), URX_VAL(lengthOp),
caseStringBuffer); |
| 3484 stringDelta += newLen - URX_VAL(lengthOp); |
| 3485 } |
| 3486 lengthOp = URX_BUILD(URX_STRING_LEN, newLen); |
| 3487 |
| 3488 fRXPat->fCompiledPat->setElementAt(op, dst); |
| 3489 fRXPat->fCompiledPat->setElementAt(lengthOp, dst+1); |
| 3490 dst += 2; |
| 3491 break; |
| 3492 } |
| 3493 case URX_BACKREF: |
| 3494 case URX_BACKREF_I: |
| 3495 { |
| 3496 int32_t where = URX_VAL(op); |
| 3497 if (where > fRXPat->fGroupMap->size()) { |
| 3498 error(U_REGEX_INVALID_BACK_REF); |
| 3499 break; |
| 3500 } |
| 3501 where = fRXPat->fGroupMap->elementAti(where-1); |
| 3502 op = URX_BUILD(opType, where); |
| 3503 fRXPat->fCompiledPat->setElementAt(op, dst); |
| 3504 dst++; |
| 3505 |
| 3506 fRXPat->fNeedsAltInput = TRUE; |
| 3507 break; |
| 3508 } |
| 3509 case URX_STRING: |
| 3510 op = URX_BUILD(URX_STRING, URX_VAL(op)+stringDelta); |
| 3511 // continue |
| 3512 case URX_RESERVED_OP: |
| 3513 case URX_RESERVED_OP_N: |
| 3514 case URX_BACKTRACK: |
| 3515 case URX_END: |
| 3516 case URX_ONECHAR: |
| 3517 case URX_STRING_LEN: |
| 3518 case URX_START_CAPTURE: |
| 3519 case URX_END_CAPTURE: |
| 3520 case URX_STATIC_SETREF: |
| 3521 case URX_STAT_SETREF_N: |
| 3522 case URX_SETREF: |
| 3523 case URX_DOTANY: |
| 3524 case URX_FAIL: |
| 3525 case URX_BACKSLASH_B: |
| 3526 case URX_BACKSLASH_BU: |
| 3527 case URX_BACKSLASH_G: |
| 3528 case URX_BACKSLASH_X: |
| 3529 case URX_BACKSLASH_Z: |
| 3530 case URX_DOTANY_ALL: |
| 3531 case URX_BACKSLASH_D: |
| 3532 case URX_CARET: |
| 3533 case URX_DOLLAR: |
| 3534 case URX_CTR_INIT: |
| 3535 case URX_CTR_INIT_NG: |
| 3536 case URX_DOTANY_UNIX: |
| 3537 case URX_STO_SP: |
| 3538 case URX_LD_SP: |
| 3539 case URX_STO_INP_LOC: |
| 3540 case URX_LA_START: |
| 3541 case URX_LA_END: |
| 3542 case URX_DOLLAR_M: |
| 3543 case URX_CARET_M: |
| 3544 case URX_CARET_M_UNIX: |
| 3545 case URX_LB_START: |
| 3546 case URX_LB_CONT: |
| 3547 case URX_LB_END: |
| 3548 case URX_LBN_CONT: |
| 3549 case URX_LBN_END: |
| 3550 case URX_LOOP_SR_I: |
| 3551 case URX_LOOP_DOT_I: |
| 3552 case URX_LOOP_C: |
| 3553 case URX_DOLLAR_D: |
| 3554 case URX_DOLLAR_MD: |
| 3555 // These instructions are unaltered by the relocation. |
| 3556 fRXPat->fCompiledPat->setElementAt(op, dst); |
| 3557 dst++; |
| 3558 break; |
| 3559 |
| 3560 default: |
| 3561 // Some op is unaccounted for. |
| 3562 U_ASSERT(FALSE); |
| 3563 error(U_REGEX_INTERNAL_ERROR); |
| 3564 } |
| 3565 } |
| 3566 |
| 3567 fRXPat->fCompiledPat->setSize(dst); |
| 3568 } |
| 3569 |
| 3570 |
| 3571 |
| 3572 |
| 3573 //------------------------------------------------------------------------------ |
| 3574 // |
| 3575 // Error Report a rule parse error. |
| 3576 // Only report it if no previous error has been recorded. |
| 3577 // |
| 3578 //------------------------------------------------------------------------------ |
| 3579 void RegexCompile::error(UErrorCode e) { |
| 3580 if (U_SUCCESS(*fStatus)) { |
| 3581 *fStatus = e; |
| 3582 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in publ
ic |
| 3583 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are |
| 3584 // int64_t. If the values of the latter are out of range for the former, |
| 3585 // set them to the appropriate "field not supported" values. |
| 3586 if (fLineNum > 0x7FFFFFFF) { |
| 3587 fParseErr->line = 0; |
| 3588 fParseErr->offset = -1; |
| 3589 } else if (fCharNum > 0x7FFFFFFF) { |
| 3590 fParseErr->line = (int32_t)fLineNum; |
| 3591 fParseErr->offset = -1; |
| 3592 } else { |
| 3593 fParseErr->line = (int32_t)fLineNum; |
| 3594 fParseErr->offset = (int32_t)fCharNum; |
| 3595 } |
| 3596 |
| 3597 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting con
text |
| 3598 |
| 3599 // Fill in the context. |
| 3600 // Note: extractBetween() pins supplied indicies to the string bounds. |
| 3601 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); |
| 3602 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); |
| 3603 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanI
ndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); |
| 3604 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_L
EN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); |
| 3605 } |
| 3606 } |
| 3607 |
| 3608 |
| 3609 // |
| 3610 // Assorted Unicode character constants. |
| 3611 // Numeric because there is no portable way to enter them as literals. |
| 3612 // (Think EBCDIC). |
| 3613 // |
| 3614 static const UChar chCR = 0x0d; // New lines, for terminating c
omments. |
| 3615 static const UChar chLF = 0x0a; // Line Feed |
| 3616 static const UChar chPound = 0x23; // '#', introduces a comment. |
| 3617 static const UChar chDigit0 = 0x30; // '0' |
| 3618 static const UChar chDigit7 = 0x37; // '9' |
| 3619 static const UChar chColon = 0x3A; // ':' |
| 3620 static const UChar chE = 0x45; // 'E' |
| 3621 static const UChar chQ = 0x51; // 'Q' |
| 3622 static const UChar chN = 0x4E; // 'N' |
| 3623 static const UChar chP = 0x50; // 'P' |
| 3624 static const UChar chBackSlash = 0x5c; // '\' introduces a char escap
e |
| 3625 static const UChar chLBracket = 0x5b; // '[' |
| 3626 static const UChar chRBracket = 0x5d; // ']' |
| 3627 static const UChar chUp = 0x5e; // '^' |
| 3628 static const UChar chLowerP = 0x70; |
| 3629 static const UChar chLBrace = 0x7b; // '{' |
| 3630 static const UChar chRBrace = 0x7d; // '}' |
| 3631 static const UChar chNEL = 0x85; // NEL newline variant |
| 3632 static const UChar chLS = 0x2028; // Unicode Line Separator |
| 3633 |
| 3634 |
| 3635 //------------------------------------------------------------------------------ |
| 3636 // |
| 3637 // nextCharLL Low Level Next Char from the regex pattern. |
| 3638 // Get a char from the string, keep track of input position |
| 3639 // for error reporting. |
| 3640 // |
| 3641 //------------------------------------------------------------------------------ |
| 3642 UChar32 RegexCompile::nextCharLL() { |
| 3643 UChar32 ch; |
| 3644 |
| 3645 if (fPeekChar != -1) { |
| 3646 ch = fPeekChar; |
| 3647 fPeekChar = -1; |
| 3648 return ch; |
| 3649 } |
| 3650 |
| 3651 // assume we're already in the right place |
| 3652 ch = UTEXT_NEXT32(fRXPat->fPattern); |
| 3653 if (ch == U_SENTINEL) { |
| 3654 return ch; |
| 3655 } |
| 3656 |
| 3657 if (ch == chCR || |
| 3658 ch == chNEL || |
| 3659 ch == chLS || |
| 3660 (ch == chLF && fLastChar != chCR)) { |
| 3661 // Character is starting a new line. Bump up the line number, and |
| 3662 // reset the column to 0. |
| 3663 fLineNum++; |
| 3664 fCharNum=0; |
| 3665 } |
| 3666 else { |
| 3667 // Character is not starting a new line. Except in the case of a |
| 3668 // LF following a CR, increment the column position. |
| 3669 if (ch != chLF) { |
| 3670 fCharNum++; |
| 3671 } |
| 3672 } |
| 3673 fLastChar = ch; |
| 3674 return ch; |
| 3675 } |
| 3676 |
| 3677 //------------------------------------------------------------------------------ |
| 3678 // |
| 3679 // peekCharLL Low Level Character Scanning, sneak a peek at the next |
| 3680 // character without actually getting it. |
| 3681 // |
| 3682 //------------------------------------------------------------------------------ |
| 3683 UChar32 RegexCompile::peekCharLL() { |
| 3684 if (fPeekChar == -1) { |
| 3685 fPeekChar = nextCharLL(); |
| 3686 } |
| 3687 return fPeekChar; |
| 3688 } |
| 3689 |
| 3690 |
| 3691 //------------------------------------------------------------------------------ |
| 3692 // |
| 3693 // nextChar for pattern scanning. At this level, we handle stripping |
| 3694 // out comments and processing some backslash character escapes. |
| 3695 // The rest of the pattern grammar is handled at the next level u
p. |
| 3696 // |
| 3697 //------------------------------------------------------------------------------ |
| 3698 void RegexCompile::nextChar(RegexPatternChar &c) { |
| 3699 |
| 3700 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); |
| 3701 c.fChar = nextCharLL(); |
| 3702 c.fQuoted = FALSE; |
| 3703 |
| 3704 if (fQuoteMode) { |
| 3705 c.fQuoted = TRUE; |
| 3706 if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-
1) { |
| 3707 fQuoteMode = FALSE; // Exit quote mode, |
| 3708 nextCharLL(); // discard the E |
| 3709 nextChar(c); // recurse to get the real next char |
| 3710 } |
| 3711 } |
| 3712 else if (fInBackslashQuote) { |
| 3713 // The current character immediately follows a '\' |
| 3714 // Don't check for any further escapes, just return it as-is. |
| 3715 // Don't set c.fQuoted, because that would prevent the state machine fro
m |
| 3716 // dispatching on the character. |
| 3717 fInBackslashQuote = FALSE; |
| 3718 } |
| 3719 else |
| 3720 { |
| 3721 // We are not in a \Q quoted region \E of the source. |
| 3722 // |
| 3723 if (fModeFlags & UREGEX_COMMENTS) { |
| 3724 // |
| 3725 // We are in free-spacing and comments mode. |
| 3726 // Scan through any white space and comments, until we |
| 3727 // reach a significant character or the end of inut. |
| 3728 for (;;) { |
| 3729 if (c.fChar == (UChar32)-1) { |
| 3730 break; // End of Input |
| 3731 } |
| 3732 if (c.fChar == chPound && fEOLComments == TRUE) { |
| 3733 // Start of a comment. Consume the rest of it, until EOF or
a new line |
| 3734 for (;;) { |
| 3735 c.fChar = nextCharLL(); |
| 3736 if (c.fChar == (UChar32)-1 || // EOF |
| 3737 c.fChar == chCR || |
| 3738 c.fChar == chLF || |
| 3739 c.fChar == chNEL || |
| 3740 c.fChar == chLS) { |
| 3741 break; |
| 3742 } |
| 3743 } |
| 3744 } |
| 3745 // TODO: check what Java & Perl do with non-ASCII white spaces.
Ticket 6061. |
| 3746 if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) { |
| 3747 break; |
| 3748 } |
| 3749 c.fChar = nextCharLL(); |
| 3750 } |
| 3751 } |
| 3752 |
| 3753 // |
| 3754 // check for backslash escaped characters. |
| 3755 // |
| 3756 if (c.fChar == chBackSlash) { |
| 3757 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); |
| 3758 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekChar
LL())) { |
| 3759 // |
| 3760 // A '\' sequence that is handled by ICU's standard unescapeAt f
unction. |
| 3761 // Includes \uxxxx, \n, \r, many others. |
| 3762 // Return the single equivalent character. |
| 3763 // |
| 3764 nextCharLL(); // get & discard the peeked char. |
| 3765 c.fQuoted = TRUE; |
| 3766 |
| 3767 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength))
{ |
| 3768 int32_t endIndex = (int32_t)pos; |
| 3769 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endInd
ex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); |
| 3770 |
| 3771 if (endIndex == pos) { |
| 3772 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
| 3773 } |
| 3774 fCharNum += endIndex - pos; |
| 3775 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); |
| 3776 } else { |
| 3777 int32_t offset = 0; |
| 3778 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEX
T_UNESCAPE_CONTEXT(fRXPat->fPattern); |
| 3779 |
| 3780 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); |
| 3781 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset
, INT32_MAX, &context); |
| 3782 |
| 3783 if (offset == 0) { |
| 3784 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
| 3785 } else if (context.lastOffset == offset) { |
| 3786 UTEXT_PREVIOUS32(fRXPat->fPattern); |
| 3787 } else if (context.lastOffset != offset-1) { |
| 3788 utext_moveIndex32(fRXPat->fPattern, offset - context.las
tOffset - 1); |
| 3789 } |
| 3790 fCharNum += offset; |
| 3791 } |
| 3792 } |
| 3793 else if (peekCharLL() == chDigit0) { |
| 3794 // Octal Escape, using Java Regexp Conventions |
| 3795 // which are \0 followed by 1-3 octal digits. |
| 3796 // Different from ICU Unescape handling of Octal, which does
not |
| 3797 // require the leading 0. |
| 3798 // Java also has the convention of only consuming 2 octal digit
s if |
| 3799 // the three digit number would be > 0xff |
| 3800 // |
| 3801 c.fChar = 0; |
| 3802 nextCharLL(); // Consume the initial 0. |
| 3803 int index; |
| 3804 for (index=0; index<3; index++) { |
| 3805 int32_t ch = peekCharLL(); |
| 3806 if (ch<chDigit0 || ch>chDigit7) { |
| 3807 if (index==0) { |
| 3808 // \0 is not followed by any octal digits. |
| 3809 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
| 3810 } |
| 3811 break; |
| 3812 } |
| 3813 c.fChar <<= 3; |
| 3814 c.fChar += ch&7; |
| 3815 if (c.fChar <= 255) { |
| 3816 nextCharLL(); |
| 3817 } else { |
| 3818 // The last digit made the number too big. Forget we sa
w it. |
| 3819 c.fChar >>= 3; |
| 3820 } |
| 3821 } |
| 3822 c.fQuoted = TRUE; |
| 3823 } |
| 3824 else if (peekCharLL() == chQ) { |
| 3825 // "\Q" enter quote mode, which will continue until "\E" |
| 3826 fQuoteMode = TRUE; |
| 3827 nextCharLL(); // discard the 'Q'. |
| 3828 nextChar(c); // recurse to get the real next char. |
| 3829 } |
| 3830 else |
| 3831 { |
| 3832 // We are in a '\' escape that will be handled by the state tabl
e scanner. |
| 3833 // Just return the backslash, but remember that the following ch
ar is to |
| 3834 // be taken literally. |
| 3835 fInBackslashQuote = TRUE; |
| 3836 } |
| 3837 } |
| 3838 } |
| 3839 |
| 3840 // re-enable # to end-of-line comments, in case they were disabled. |
| 3841 // They are disabled by the parser upon seeing '(?', but this lasts for |
| 3842 // the fetching of the next character only. |
| 3843 fEOLComments = TRUE; |
| 3844 |
| 3845 // putc(c.fChar, stdout); |
| 3846 } |
| 3847 |
| 3848 |
| 3849 |
| 3850 //------------------------------------------------------------------------------ |
| 3851 // |
| 3852 // scanNamedChar |
| 3853 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. |
| 3854 // |
| 3855 // The scan position will be at the 'N'. On return |
| 3856 // the scan position should be just after the '}' |
| 3857 // |
| 3858 // Return the UChar32 |
| 3859 // |
| 3860 //------------------------------------------------------------------------------ |
| 3861 UChar32 RegexCompile::scanNamedChar() { |
| 3862 if (U_FAILURE(*fStatus)) { |
| 3863 return 0; |
| 3864 } |
| 3865 |
| 3866 nextChar(fC); |
| 3867 if (fC.fChar != chLBrace) { |
| 3868 error(U_REGEX_PROPERTY_SYNTAX); |
| 3869 return 0; |
| 3870 } |
| 3871 |
| 3872 UnicodeString charName; |
| 3873 for (;;) { |
| 3874 nextChar(fC); |
| 3875 if (fC.fChar == chRBrace) { |
| 3876 break; |
| 3877 } |
| 3878 if (fC.fChar == -1) { |
| 3879 error(U_REGEX_PROPERTY_SYNTAX); |
| 3880 return 0; |
| 3881 } |
| 3882 charName.append(fC.fChar); |
| 3883 } |
| 3884 |
| 3885 char name[100]; |
| 3886 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || |
| 3887 (uint32_t)charName.length()>=sizeof(name)) { |
| 3888 // All Unicode character names have only invariant characters. |
| 3889 // The API to get a character, given a name, accepts only char *, forcin
g us to convert, |
| 3890 // which requires this error check |
| 3891 error(U_REGEX_PROPERTY_SYNTAX); |
| 3892 return 0; |
| 3893 } |
| 3894 charName.extract(0, charName.length(), name, sizeof(name), US_INV); |
| 3895 |
| 3896 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus); |
| 3897 if (U_FAILURE(*fStatus)) { |
| 3898 error(U_REGEX_PROPERTY_SYNTAX); |
| 3899 } |
| 3900 |
| 3901 nextChar(fC); // Continue overall regex pattern processing with char af
ter the '}' |
| 3902 return theChar; |
| 3903 } |
| 3904 |
| 3905 //------------------------------------------------------------------------------ |
| 3906 // |
| 3907 // scanProp Construct a UnicodeSet from the text at the current scan |
| 3908 // position, which will be of the form \p{whaterver} |
| 3909 // |
| 3910 // The scan position will be at the 'p' or 'P'. On return |
| 3911 // the scan position should be just after the '}' |
| 3912 // |
| 3913 // Return a UnicodeSet, constructed from the \P pattern, |
| 3914 // or NULL if the pattern is invalid. |
| 3915 // |
| 3916 //------------------------------------------------------------------------------ |
| 3917 UnicodeSet *RegexCompile::scanProp() { |
| 3918 UnicodeSet *uset = NULL; |
| 3919 |
| 3920 if (U_FAILURE(*fStatus)) { |
| 3921 return NULL; |
| 3922 } |
| 3923 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); |
| 3924 UBool negated = (fC.fChar == chP); |
| 3925 |
| 3926 UnicodeString propertyName; |
| 3927 nextChar(fC); |
| 3928 if (fC.fChar != chLBrace) { |
| 3929 error(U_REGEX_PROPERTY_SYNTAX); |
| 3930 return NULL; |
| 3931 } |
| 3932 for (;;) { |
| 3933 nextChar(fC); |
| 3934 if (fC.fChar == chRBrace) { |
| 3935 break; |
| 3936 } |
| 3937 if (fC.fChar == -1) { |
| 3938 // Hit the end of the input string without finding the closing '}' |
| 3939 error(U_REGEX_PROPERTY_SYNTAX); |
| 3940 return NULL; |
| 3941 } |
| 3942 propertyName.append(fC.fChar); |
| 3943 } |
| 3944 uset = createSetForProperty(propertyName, negated); |
| 3945 nextChar(fC); // Move input scan to position following the closing '}' |
| 3946 return uset; |
| 3947 } |
| 3948 |
| 3949 //------------------------------------------------------------------------------ |
| 3950 // |
| 3951 // scanPosixProp Construct a UnicodeSet from the text at the current scan |
| 3952 // position, which is expected be of the form [:property expression:
] |
| 3953 // |
| 3954 // The scan position will be at the opening ':'. On return |
| 3955 // the scan position must be on the closing ']' |
| 3956 // |
| 3957 // Return a UnicodeSet constructed from the pattern, |
| 3958 // or NULL if this is not a valid POSIX-style set expression. |
| 3959 // If not a property expression, restore the initial scan position |
| 3960 // (to the opening ':') |
| 3961 // |
| 3962 // Note: the opening '[:' is not sufficient to guarantee that |
| 3963 // this is a [:property:] expression. |
| 3964 // [:'+=,] is a perfectly good ordinary set expression that |
| 3965 // happens to include ':' as one of its characters. |
| 3966 // |
| 3967 //------------------------------------------------------------------------------ |
| 3968 UnicodeSet *RegexCompile::scanPosixProp() { |
| 3969 UnicodeSet *uset = NULL; |
| 3970 |
| 3971 if (U_FAILURE(*fStatus)) { |
| 3972 return NULL; |
| 3973 } |
| 3974 |
| 3975 U_ASSERT(fC.fChar == chColon); |
| 3976 |
| 3977 // Save the scanner state. |
| 3978 // TODO: move this into the scanner, with the state encapsulated in some wa
y. Ticket 6062 |
| 3979 int64_t savedScanIndex = fScanIndex; |
| 3980 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); |
| 3981 UBool savedQuoteMode = fQuoteMode; |
| 3982 UBool savedInBackslashQuote = fInBackslashQuote; |
| 3983 UBool savedEOLComments = fEOLComments; |
| 3984 int64_t savedLineNum = fLineNum; |
| 3985 int64_t savedCharNum = fCharNum; |
| 3986 UChar32 savedLastChar = fLastChar; |
| 3987 UChar32 savedPeekChar = fPeekChar; |
| 3988 RegexPatternChar savedfC = fC; |
| 3989 |
| 3990 // Scan for a closing ]. A little tricky because there are some perverse |
| 3991 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expre
ssion, |
| 3992 // ending on the second closing ]. |
| 3993 |
| 3994 UnicodeString propName; |
| 3995 UBool negated = FALSE; |
| 3996 |
| 3997 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Lette
r:] |
| 3998 nextChar(fC); |
| 3999 if (fC.fChar == chUp) { |
| 4000 negated = TRUE; |
| 4001 nextChar(fC); |
| 4002 } |
| 4003 |
| 4004 // Scan for the closing ":]", collecting the property name along the way. |
| 4005 UBool sawPropSetTerminator = FALSE; |
| 4006 for (;;) { |
| 4007 propName.append(fC.fChar); |
| 4008 nextChar(fC); |
| 4009 if (fC.fQuoted || fC.fChar == -1) { |
| 4010 // Escaped characters or end of input - either says this isn't a [:P
roperty:] |
| 4011 break; |
| 4012 } |
| 4013 if (fC.fChar == chColon) { |
| 4014 nextChar(fC); |
| 4015 if (fC.fChar == chRBracket) { |
| 4016 sawPropSetTerminator = TRUE; |
| 4017 } |
| 4018 break; |
| 4019 } |
| 4020 } |
| 4021 |
| 4022 if (sawPropSetTerminator) { |
| 4023 uset = createSetForProperty(propName, negated); |
| 4024 } |
| 4025 else |
| 4026 { |
| 4027 // No closing ":]". |
| 4028 // Restore the original scan position. |
| 4029 // The main scanner will retry the input as a normal set expression, |
| 4030 // not a [:Property:] expression. |
| 4031 fScanIndex = savedScanIndex; |
| 4032 fQuoteMode = savedQuoteMode; |
| 4033 fInBackslashQuote = savedInBackslashQuote; |
| 4034 fEOLComments = savedEOLComments; |
| 4035 fLineNum = savedLineNum; |
| 4036 fCharNum = savedCharNum; |
| 4037 fLastChar = savedLastChar; |
| 4038 fPeekChar = savedPeekChar; |
| 4039 fC = savedfC; |
| 4040 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex); |
| 4041 } |
| 4042 return uset; |
| 4043 } |
| 4044 |
| 4045 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { |
| 4046 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); |
| 4047 addCategory(set, U_GC_CF_MASK, ec); |
| 4048 } |
| 4049 |
| 4050 // |
| 4051 // Create a Unicode Set from a Unicode Property expression. |
| 4052 // This is common code underlying both \p{...} ane [:...:] expressions. |
| 4053 // Includes trying the Java "properties" that aren't supported as |
| 4054 // normal ICU UnicodeSet properties |
| 4055 // |
| 4056 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" |
| 4057 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" |
| 4058 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
ool negated) { |
| 4059 UnicodeString setExpr; |
| 4060 UnicodeSet *set; |
| 4061 uint32_t usetFlags = 0; |
| 4062 |
| 4063 if (U_FAILURE(*fStatus)) { |
| 4064 return NULL; |
| 4065 } |
| 4066 |
| 4067 // |
| 4068 // First try the property as we received it |
| 4069 // |
| 4070 if (negated) { |
| 4071 setExpr.append(negSetPrefix, -1); |
| 4072 } else { |
| 4073 setExpr.append(posSetPrefix, -1); |
| 4074 } |
| 4075 setExpr.append(propName); |
| 4076 setExpr.append(chRBrace); |
| 4077 setExpr.append(chRBracket); |
| 4078 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 4079 usetFlags |= USET_CASE_INSENSITIVE; |
| 4080 } |
| 4081 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); |
| 4082 if (U_SUCCESS(*fStatus)) { |
| 4083 return set; |
| 4084 } |
| 4085 delete set; |
| 4086 set = NULL; |
| 4087 |
| 4088 // |
| 4089 // The property as it was didn't work. |
| 4090 |
| 4091 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" n
ot standard POSIX |
| 4092 // or standard Java, but many other regular expression packages do recog
nize it. |
| 4093 |
| 4094 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) { |
| 4095 *fStatus = U_ZERO_ERROR; |
| 4096 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])); |
| 4097 if (set == NULL) { |
| 4098 *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| 4099 return set; |
| 4100 } |
| 4101 if (negated) { |
| 4102 set->complement(); |
| 4103 } |
| 4104 return set; |
| 4105 } |
| 4106 |
| 4107 |
| 4108 // Do Java fixes - |
| 4109 // InGreek -> InGreek or Coptic, that being the official Unicode name
for that block. |
| 4110 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols
. |
| 4111 // |
| 4112 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombinin
g Marks for Symbols" |
| 4113 // is accepted by Java. The property part of the nam
e is compared |
| 4114 // case-insenstively. The spaces must be exactly as
shown, either |
| 4115 // all there, or all omitted, with exactly one at eac
h position |
| 4116 // if they are present. From checking against JDK 1.
6 |
| 4117 // |
| 4118 // This code should be removed when ICU properties support the Java c
ompatibility names |
| 4119 // (ICU 4.0?) |
| 4120 // |
| 4121 UnicodeString mPropName = propName; |
| 4122 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { |
| 4123 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); |
| 4124 } |
| 4125 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbo
ls"), 0) == 0 || |
| 4126 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"
), 0) == 0) { |
| 4127 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Sym
bols"); |
| 4128 } |
| 4129 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { |
| 4130 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); |
| 4131 } |
| 4132 |
| 4133 // See if the property looks like a Java "InBlockName", which |
| 4134 // we will recast as "Block=BlockName" |
| 4135 // |
| 4136 static const UChar IN[] = {0x49, 0x6E, 0}; // "In" |
| 4137 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "
Block=" |
| 4138 if (mPropName.startsWith(IN, 2) && propName.length()>=3) { |
| 4139 setExpr.truncate(4); // Leaves "[\p{", or "[\P{" |
| 4140 setExpr.append(BLOCK, -1); |
| 4141 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leadi
ng "In" removed. |
| 4142 setExpr.append(chRBrace); |
| 4143 setExpr.append(chRBracket); |
| 4144 *fStatus = U_ZERO_ERROR; |
| 4145 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); |
| 4146 if (U_SUCCESS(*fStatus)) { |
| 4147 return set; |
| 4148 } |
| 4149 delete set; |
| 4150 set = NULL; |
| 4151 } |
| 4152 |
| 4153 if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) || |
| 4154 propName.compare(UNICODE_STRING_SIMPLE("all")) == 0) |
| 4155 { |
| 4156 UErrorCode localStatus = U_ZERO_ERROR; |
| 4157 //setExpr.remove(); |
| 4158 set = new UnicodeSet(); |
| 4159 // |
| 4160 // Try the various Java specific properties. |
| 4161 // These all begin with "java" |
| 4162 // |
| 4163 if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) { |
| 4164 addCategory(set, U_GC_CN_MASK, localStatus); |
| 4165 set->complement(); |
| 4166 } |
| 4167 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) { |
| 4168 addCategory(set, U_GC_ND_MASK, localStatus); |
| 4169 } |
| 4170 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorabl
e")) == 0) { |
| 4171 addIdentifierIgnorable(set, localStatus); |
| 4172 } |
| 4173 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0
) { |
| 4174 set->add(0, 0x1F).add(0x7F, 0x9F); |
| 4175 } |
| 4176 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart
")) == 0) { |
| 4177 addCategory(set, U_GC_L_MASK, localStatus); |
| 4178 addCategory(set, U_GC_SC_MASK, localStatus); |
| 4179 addCategory(set, U_GC_PC_MASK, localStatus); |
| 4180 addCategory(set, U_GC_ND_MASK, localStatus); |
| 4181 addCategory(set, U_GC_NL_MASK, localStatus); |
| 4182 addCategory(set, U_GC_MC_MASK, localStatus); |
| 4183 addCategory(set, U_GC_MN_MASK, localStatus); |
| 4184 addIdentifierIgnorable(set, localStatus); |
| 4185 } |
| 4186 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStar
t")) == 0) { |
| 4187 addCategory(set, U_GC_L_MASK, localStatus); |
| 4188 addCategory(set, U_GC_NL_MASK, localStatus); |
| 4189 addCategory(set, U_GC_SC_MASK, localStatus); |
| 4190 addCategory(set, U_GC_PC_MASK, localStatus); |
| 4191 } |
| 4192 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) { |
| 4193 addCategory(set, U_GC_L_MASK, localStatus); |
| 4194 } |
| 4195 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) =
= 0) { |
| 4196 addCategory(set, U_GC_L_MASK, localStatus); |
| 4197 addCategory(set, U_GC_ND_MASK, localStatus); |
| 4198 } |
| 4199 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0)
{ |
| 4200 addCategory(set, U_GC_LL_MASK, localStatus); |
| 4201 } |
| 4202 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0)
{ |
| 4203 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus); |
| 4204 } |
| 4205 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0)
{ |
| 4206 addCategory(set, U_GC_Z_MASK, localStatus); |
| 4207 } |
| 4208 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodeP
oint")) == 0) { |
| 4209 set->add(0x10000, UnicodeSet::MAX_VALUE); |
| 4210 } |
| 4211 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0)
{ |
| 4212 addCategory(set, U_GC_LT_MASK, localStatus); |
| 4213 } |
| 4214 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierS
tart")) == 0) { |
| 4215 addCategory(set, U_GC_L_MASK, localStatus); |
| 4216 addCategory(set, U_GC_NL_MASK, localStatus); |
| 4217 } |
| 4218 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierP
art")) == 0) { |
| 4219 addCategory(set, U_GC_L_MASK, localStatus); |
| 4220 addCategory(set, U_GC_PC_MASK, localStatus); |
| 4221 addCategory(set, U_GC_ND_MASK, localStatus); |
| 4222 addCategory(set, U_GC_NL_MASK, localStatus); |
| 4223 addCategory(set, U_GC_MC_MASK, localStatus); |
| 4224 addCategory(set, U_GC_MN_MASK, localStatus); |
| 4225 addIdentifierIgnorable(set, localStatus); |
| 4226 } |
| 4227 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0)
{ |
| 4228 addCategory(set, U_GC_LU_MASK, localStatus); |
| 4229 } |
| 4230 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint"))
== 0) { |
| 4231 set->add(0, UnicodeSet::MAX_VALUE); |
| 4232 } |
| 4233 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0
) { |
| 4234 addCategory(set, U_GC_Z_MASK, localStatus); |
| 4235 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f)); |
| 4236 set->add(9, 0x0d).add(0x1c, 0x1f); |
| 4237 } |
| 4238 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { |
| 4239 set->add(0, UnicodeSet::MAX_VALUE); |
| 4240 } |
| 4241 |
| 4242 if (U_SUCCESS(localStatus) && !set->isEmpty()) { |
| 4243 *fStatus = U_ZERO_ERROR; |
| 4244 if (usetFlags & USET_CASE_INSENSITIVE) { |
| 4245 set->closeOver(USET_CASE_INSENSITIVE); |
| 4246 } |
| 4247 if (negated) { |
| 4248 set->complement(); |
| 4249 } |
| 4250 return set; |
| 4251 } |
| 4252 delete set; |
| 4253 set = NULL; |
| 4254 } |
| 4255 error(*fStatus); |
| 4256 return NULL; |
| 4257 } |
| 4258 |
| 4259 |
| 4260 |
| 4261 // |
| 4262 // SetEval Part of the evaluation of [set expressions]. |
| 4263 // Perform any pending (stacked) operations with precedence |
| 4264 // equal or greater to that of the next operator encountered |
| 4265 // in the expression. |
| 4266 // |
| 4267 void RegexCompile::setEval(int32_t nextOp) { |
| 4268 UnicodeSet *rightOperand = NULL; |
| 4269 UnicodeSet *leftOperand = NULL; |
| 4270 for (;;) { |
| 4271 U_ASSERT(fSetOpStack.empty()==FALSE); |
| 4272 int32_t pendingSetOperation = fSetOpStack.peeki(); |
| 4273 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) { |
| 4274 break; |
| 4275 } |
| 4276 fSetOpStack.popi(); |
| 4277 U_ASSERT(fSetStack.empty() == FALSE); |
| 4278 rightOperand = (UnicodeSet *)fSetStack.peek(); |
| 4279 switch (pendingSetOperation) { |
| 4280 case setNegation: |
| 4281 rightOperand->complement(); |
| 4282 break; |
| 4283 case setCaseClose: |
| 4284 // TODO: need a simple close function. Ticket 6065 |
| 4285 rightOperand->closeOver(USET_CASE_INSENSITIVE); |
| 4286 rightOperand->removeAllStrings(); |
| 4287 break; |
| 4288 case setDifference1: |
| 4289 case setDifference2: |
| 4290 fSetStack.pop(); |
| 4291 leftOperand = (UnicodeSet *)fSetStack.peek(); |
| 4292 leftOperand->removeAll(*rightOperand); |
| 4293 delete rightOperand; |
| 4294 break; |
| 4295 case setIntersection1: |
| 4296 case setIntersection2: |
| 4297 fSetStack.pop(); |
| 4298 leftOperand = (UnicodeSet *)fSetStack.peek(); |
| 4299 leftOperand->retainAll(*rightOperand); |
| 4300 delete rightOperand; |
| 4301 break; |
| 4302 case setUnion: |
| 4303 fSetStack.pop(); |
| 4304 leftOperand = (UnicodeSet *)fSetStack.peek(); |
| 4305 leftOperand->addAll(*rightOperand); |
| 4306 delete rightOperand; |
| 4307 break; |
| 4308 default: |
| 4309 U_ASSERT(FALSE); |
| 4310 break; |
| 4311 } |
| 4312 } |
| 4313 } |
| 4314 |
| 4315 void RegexCompile::setPushOp(int32_t op) { |
| 4316 setEval(op); |
| 4317 fSetOpStack.push(op, *fStatus); |
| 4318 fSetStack.push(new UnicodeSet(), *fStatus); |
| 4319 } |
| 4320 |
| 4321 U_NAMESPACE_END |
| 4322 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 4323 |
OLD | NEW |