| OLD | NEW |
| 1 // | 1 // |
| 2 // file: regexcmp.cpp | 2 // file: regexcmp.cpp |
| 3 // | 3 // |
| 4 // Copyright (C) 2002-2013 International Business Machines Corporation and othe
rs. | 4 // Copyright (C) 2002-2014 International Business Machines Corporation and othe
rs. |
| 5 // All Rights Reserved. | 5 // All Rights Reserved. |
| 6 // | 6 // |
| 7 // This file contains the ICU regular expression compiler, which is responsible | 7 // This file contains the ICU regular expression compiler, which is responsible |
| 8 // for processing a regular expression pattern into the compiled form that | 8 // for processing a regular expression pattern into the compiled form that |
| 9 // is used by the match finding engine. | 9 // is used by the match finding engine. |
| 10 // | 10 // |
| 11 | 11 |
| 12 #include "unicode/utypes.h" | 12 #include "unicode/utypes.h" |
| 13 | 13 |
| 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 15 | 15 |
| 16 #include "unicode/ustring.h" | 16 #include "unicode/ustring.h" |
| 17 #include "unicode/unistr.h" | 17 #include "unicode/unistr.h" |
| 18 #include "unicode/uniset.h" | 18 #include "unicode/uniset.h" |
| 19 #include "unicode/uchar.h" | 19 #include "unicode/uchar.h" |
| 20 #include "unicode/uchriter.h" | 20 #include "unicode/uchriter.h" |
| 21 #include "unicode/parsepos.h" | 21 #include "unicode/parsepos.h" |
| 22 #include "unicode/parseerr.h" | 22 #include "unicode/parseerr.h" |
| 23 #include "unicode/regex.h" | 23 #include "unicode/regex.h" |
| 24 #include "unicode/utf.h" | 24 #include "unicode/utf.h" |
| 25 #include "unicode/utf16.h" | 25 #include "unicode/utf16.h" |
| 26 #include "patternprops.h" | 26 #include "patternprops.h" |
| 27 #include "putilimp.h" | 27 #include "putilimp.h" |
| 28 #include "cmemory.h" | 28 #include "cmemory.h" |
| 29 #include "cstring.h" | 29 #include "cstring.h" |
| 30 #include "uvectr32.h" | 30 #include "uvectr32.h" |
| 31 #include "uvectr64.h" | 31 #include "uvectr64.h" |
| 32 #include "uassert.h" | 32 #include "uassert.h" |
| 33 #include "ucln_in.h" | |
| 34 #include "uinvchar.h" | 33 #include "uinvchar.h" |
| 35 | 34 |
| 36 #include "regeximp.h" | 35 #include "regeximp.h" |
| 37 #include "regexcst.h" // Contains state table for the regex pattern parser. | 36 #include "regexcst.h" // Contains state table for the regex pattern parser. |
| 38 // generated by a Perl script. | 37 // generated by a Perl script. |
| 39 #include "regexcmp.h" | 38 #include "regexcmp.h" |
| 40 #include "regexst.h" | 39 #include "regexst.h" |
| 41 #include "regextxt.h" | 40 #include "regextxt.h" |
| 42 | 41 |
| 43 | 42 |
| (...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 102 // | 101 // |
| 103 //------------------------------------------------------------------------------ | 102 //------------------------------------------------------------------------------ |
| 104 void RegexCompile::compile( | 103 void RegexCompile::compile( |
| 105 const UnicodeString &pat, // Source pat to be compile
d. | 104 const UnicodeString &pat, // Source pat to be compile
d. |
| 106 UParseError &pp, // Error position info | 105 UParseError &pp, // Error position info |
| 107 UErrorCode &e) // Error Code | 106 UErrorCode &e) // Error Code |
| 108 { | 107 { |
| 109 fRXPat->fPatternString = new UnicodeString(pat); | 108 fRXPat->fPatternString = new UnicodeString(pat); |
| 110 UText patternText = UTEXT_INITIALIZER; | 109 UText patternText = UTEXT_INITIALIZER; |
| 111 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); | 110 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); |
| 112 | 111 |
| 113 if (U_SUCCESS(e)) { | 112 if (U_SUCCESS(e)) { |
| 114 compile(&patternText, pp, e); | 113 compile(&patternText, pp, e); |
| 115 utext_close(&patternText); | 114 utext_close(&patternText); |
| 116 } | 115 } |
| 117 } | 116 } |
| 118 | 117 |
| 119 // | 118 // |
| 120 // compile, UText mode | 119 // compile, UText mode |
| 121 // All the work is actually done here. | 120 // All the work is actually done here. |
| 122 // | 121 // |
| (...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 295 n *= 10; | 294 n *= 10; |
| 296 } | 295 } |
| 297 | 296 |
| 298 // | 297 // |
| 299 // The pattern's fFrameSize so far has accumulated the requirements for | 298 // The pattern's fFrameSize so far has accumulated the requirements for |
| 300 // storage for capture parentheses, counters, etc. that are encountered | 299 // storage for capture parentheses, counters, etc. that are encountered |
| 301 // in the pattern. Add space for the two variables that are always | 300 // in the pattern. Add space for the two variables that are always |
| 302 // present in the saved state: the input string position (int64_t) and | 301 // present in the saved state: the input string position (int64_t) and |
| 303 // the position in the compiled pattern. | 302 // the position in the compiled pattern. |
| 304 // | 303 // |
| 305 allocateStackData(RESTACKFRAME_HDRCOUNT); | 304 fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; |
| 306 | 305 |
| 307 // | 306 // |
| 308 // Optimization pass 1: NOPs, back-references, and case-folding | 307 // Optimization pass 1: NOPs, back-references, and case-folding |
| 309 // | 308 // |
| 310 stripNOPs(); | 309 stripNOPs(); |
| 311 | 310 |
| 312 // | 311 // |
| 313 // Get bounds for the minimum and maximum length of a string that this | 312 // Get bounds for the minimum and maximum length of a string that this |
| 314 // pattern can match. Used to avoid looking for matches in strings that | 313 // pattern can match. Used to avoid looking for matches in strings that |
| 315 // are too short. | 314 // are too short. |
| (...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 361 | 360 |
| 362 case doPatStart: | 361 case doPatStart: |
| 363 // Start of pattern compiles to: | 362 // Start of pattern compiles to: |
| 364 //0 SAVE 2 Fall back to position of FAIL | 363 //0 SAVE 2 Fall back to position of FAIL |
| 365 //1 jmp 3 | 364 //1 jmp 3 |
| 366 //2 FAIL Stop if we ever reach here. | 365 //2 FAIL Stop if we ever reach here. |
| 367 //3 NOP Dummy, so start of pattern looks the same as | 366 //3 NOP Dummy, so start of pattern looks the same as |
| 368 // the start of an ( grouping. | 367 // the start of an ( grouping. |
| 369 //4 NOP Reserved, will be replaced by a save if there are | 368 //4 NOP Reserved, will be replaced by a save if there are |
| 370 // OR | operators at the top level | 369 // OR | operators at the top level |
| 371 appendOp(URX_BUILD(URX_STATE_SAVE, 2)); | 370 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus)
; |
| 372 appendOp(URX_BUILD(URX_JMP, 3)); | 371 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); |
| 373 appendOp(URX_BUILD(URX_FAIL, 0)); | 372 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); |
| 374 | 373 |
| 375 // Standard open nonCapture paren action emits the two NOPs and | 374 // Standard open nonCapture paren action emits the two NOPs and |
| 376 // sets up the paren stack frame. | 375 // sets up the paren stack frame. |
| 377 doParseActions(doOpenNonCaptureParen); | 376 doParseActions(doOpenNonCaptureParen); |
| 378 break; | 377 break; |
| 379 | 378 |
| 380 case doPatFinish: | 379 case doPatFinish: |
| 381 // We've scanned to the end of the pattern | 380 // We've scanned to the end of the pattern |
| 382 // The end of pattern compiles to: | 381 // The end of pattern compiles to: |
| 383 // URX_END | 382 // URX_END |
| 384 // which will stop the runtime match engine. | 383 // which will stop the runtime match engine. |
| 385 // Encountering end of pattern also behaves like a close paren, | 384 // Encountering end of pattern also behaves like a close paren, |
| 386 // and forces fixups of the State Save at the beginning of the compile
d pattern | 385 // and forces fixups of the State Save at the beginning of the compile
d pattern |
| 387 // and of any OR operations at the top level. | 386 // and of any OR operations at the top level. |
| 388 // | 387 // |
| 389 handleCloseParen(); | 388 handleCloseParen(); |
| 390 if (fParenStack.size() > 0) { | 389 if (fParenStack.size() > 0) { |
| 391 // Missing close paren in pattern. | 390 // Missing close paren in pattern. |
| 392 error(U_REGEX_MISMATCHED_PAREN); | 391 error(U_REGEX_MISMATCHED_PAREN); |
| 393 } | 392 } |
| 394 | 393 |
| 395 // add the END operation to the compiled pattern. | 394 // add the END operation to the compiled pattern. |
| 396 appendOp(URX_BUILD(URX_END, 0)); | 395 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); |
| 397 | 396 |
| 398 // Terminate the pattern compilation state machine. | 397 // Terminate the pattern compilation state machine. |
| 399 returnVal = FALSE; | 398 returnVal = FALSE; |
| 400 break; | 399 break; |
| 401 | 400 |
| 402 | 401 |
| 403 | 402 |
| 404 case doOrOperator: | 403 case doOrOperator: |
| 405 // Scanning a '|', as in (A|B) | 404 // Scanning a '|', as in (A|B) |
| 406 { | 405 { |
| 407 // Generate code for any pending literals preceding the '|' | 406 // Generate code for any pending literals preceding the '|' |
| 408 fixLiterals(FALSE); | 407 fixLiterals(FALSE); |
| 409 | 408 |
| 410 // Insert a SAVE operation at the start of the pattern section prece
ding | 409 // Insert a SAVE operation at the start of the pattern section prece
ding |
| 411 // this OR at this level. This SAVE will branch the match forward | 410 // this OR at this level. This SAVE will branch the match forward |
| 412 // to the right hand side of the OR in the event that the left han
d | 411 // to the right hand side of the OR in the event that the left han
d |
| 413 // side fails to match and backtracks. Locate the position for th
e | 412 // side fails to match and backtracks. Locate the position for th
e |
| 414 // save from the location on the top of the parentheses stack. | 413 // save from the location on the top of the parentheses stack. |
| 415 int32_t savePosition = fParenStack.popi(); | 414 int32_t savePosition = fParenStack.popi(); |
| 416 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition)
; | 415 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition)
; |
| 417 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved
location | 416 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved
location |
| 418 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); | 417 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); |
| 419 fRXPat->fCompiledPat->setElementAt(op, savePosition); | 418 fRXPat->fCompiledPat->setElementAt(op, savePosition); |
| 420 | 419 |
| 421 // Append a JMP operation into the compiled pattern. The operand f
or | 420 // Append a JMP operation into the compiled pattern. The operand f
or |
| 422 // the JMP will eventually be the location following the ')' for th
e | 421 // the JMP will eventually be the location following the ')' for th
e |
| 423 // group. This will be patched in later, when the ')' is encounter
ed. | 422 // group. This will be patched in later, when the ')' is encounter
ed. |
| 424 op = URX_BUILD(URX_JMP, 0); | 423 op = URX_BUILD(URX_JMP, 0); |
| 425 appendOp(op); | 424 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 426 | 425 |
| 427 // Push the position of the newly added JMP op onto the parentheses
stack. | 426 // Push the position of the newly added JMP op onto the parentheses
stack. |
| 428 // This registers it for fixup when this block's close paren is enco
untered. | 427 // This registers it for fixup when this block's close paren is enco
untered. |
| 429 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | 428 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
| 430 | 429 |
| 431 // Append a NOP to the compiled pattern. This is the slot reserved | 430 // Append a NOP to the compiled pattern. This is the slot reserved |
| 432 // for a SAVE in the event that there is yet another '|' following | 431 // for a SAVE in the event that there is yet another '|' following |
| 433 // this one. | 432 // this one. |
| 434 appendOp(URX_BUILD(URX_NOP, 0)); | 433 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 435 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | 434 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
| 436 } | 435 } |
| 437 break; | 436 break; |
| 438 | 437 |
| 439 | 438 |
| 440 case doOpenCaptureParen: | 439 case doOpenCaptureParen: |
| 441 // Open Paren. | 440 // Open Paren. |
| 442 // Compile to a | 441 // Compile to a |
| 443 // - NOP, which later may be replaced by a save-state if the | 442 // - NOP, which later may be replaced by a save-state if the |
| 444 // parenthesized group gets a * quantifier, followed by | 443 // parenthesized group gets a * quantifier, followed by |
| 445 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. | 444 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. |
| 446 // - NOP, which may later be replaced by a save-state if there | 445 // - NOP, which may later be replaced by a save-state if there |
| 447 // is an '|' alternation within the parens. | 446 // is an '|' alternation within the parens. |
| 448 // | 447 // |
| 449 // Each capture group gets three slots in the save stack frame: | 448 // Each capture group gets three slots in the save stack frame: |
| 450 // 0: Capture Group start position (in input string being matche
d.) | 449 // 0: Capture Group start position (in input string being matche
d.) |
| 451 // 1: Capture Group end position. | 450 // 1: Capture Group end position. |
| 452 // 2: Start of Match-in-progress. | 451 // 2: Start of Match-in-progress. |
| 453 // The first two locations are for a completed capture group, and are | 452 // The first two locations are for a completed capture group, and are |
| 454 // referred to by back references and the like. | 453 // referred to by back references and the like. |
| 455 // The third location stores the capture start position when a START
_CAPTURE is | 454 // The third location stores the capture start position when a START
_CAPTURE is |
| 456 // encountered. This will be promoted to a completed capture when
(and if) the corresponding | 455 // encountered. This will be promoted to a completed capture when
(and if) the corresponding |
| 457 // END_CAPTURE is encountered. | 456 // END_CAPTURE is encountered. |
| 458 { | 457 { |
| 459 fixLiterals(); | 458 fixLiterals(); |
| 460 appendOp(URX_BUILD(URX_NOP, 0)); | 459 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 461 int32_t varsLoc = allocateStackData(3); // Reserve three slots i
n match stack frame. | 460 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots
in match stack frame. |
| 462 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); | 461 fRXPat->fFrameSize += 3; |
| 463 appendOp(cop); | 462 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); |
| 464 appendOp(URX_BUILD(URX_NOP, 0)); | 463 fRXPat->fCompiledPat->addElement(cop, *fStatus); |
| 464 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 465 | 465 |
| 466 // On the Parentheses stack, start a new frame and add the positions | 466 // On the Parentheses stack, start a new frame and add the positions |
| 467 // of the two NOPs. Depending on what follows in the pattern, the | 467 // of the two NOPs. Depending on what follows in the pattern, the |
| 468 // NOPs may be changed to SAVE_STATE or JMP ops, with a target | 468 // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
| 469 // address of the end of the parenthesized group. | 469 // address of the end of the parenthesized group. |
| 470 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 470 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 471 fParenStack.push(capturing, *fStatus); // Fra
me type. | 471 fParenStack.push(capturing, *fStatus); // Fra
me type. |
| 472 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location | 472 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location |
| 473 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc | 473 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc |
| 474 | 474 |
| 475 // Save the mapping from group number to stack frame variable positi
on. | 475 // Save the mapping from group number to stack frame variable positi
on. |
| 476 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); | 476 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); |
| 477 } | 477 } |
| 478 break; | 478 break; |
| 479 | 479 |
| 480 case doOpenNonCaptureParen: | 480 case doOpenNonCaptureParen: |
| 481 // Open non-capturing (grouping only) Paren. | 481 // Open non-capturing (grouping only) Paren. |
| 482 // Compile to a | 482 // Compile to a |
| 483 // - NOP, which later may be replaced by a save-state if the | 483 // - NOP, which later may be replaced by a save-state if the |
| 484 // parenthesized group gets a * quantifier, followed by | 484 // parenthesized group gets a * quantifier, followed by |
| 485 // - NOP, which may later be replaced by a save-state if there | 485 // - NOP, which may later be replaced by a save-state if there |
| 486 // is an '|' alternation within the parens. | 486 // is an '|' alternation within the parens. |
| 487 { | 487 { |
| 488 fixLiterals(); | 488 fixLiterals(); |
| 489 appendOp(URX_BUILD(URX_NOP, 0)); | 489 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 490 appendOp(URX_BUILD(URX_NOP, 0)); | 490 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 491 | 491 |
| 492 // On the Parentheses stack, start a new frame and add the positions | 492 // On the Parentheses stack, start a new frame and add the positions |
| 493 // of the two NOPs. | 493 // of the two NOPs. |
| 494 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 494 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 495 fParenStack.push(plain, *fStatus); // Beg
in a new frame. | 495 fParenStack.push(plain, *fStatus); // Beg
in a new frame. |
| 496 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 496 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
| 497 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc | 497 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc |
| 498 } | 498 } |
| 499 break; | 499 break; |
| 500 | 500 |
| 501 | 501 |
| 502 case doOpenAtomicParen: | 502 case doOpenAtomicParen: |
| 503 // Open Atomic Paren. (?> | 503 // Open Atomic Paren. (?> |
| 504 // Compile to a | 504 // Compile to a |
| 505 // - NOP, which later may be replaced if the parenthesized group | 505 // - NOP, which later may be replaced if the parenthesized group |
| 506 // has a quantifier, followed by | 506 // has a quantifier, followed by |
| 507 // - STO_SP save state stack position, so it can be restored at th
e ")" | 507 // - STO_SP save state stack position, so it can be restored at th
e ")" |
| 508 // - NOP, which may later be replaced by a save-state if there | 508 // - NOP, which may later be replaced by a save-state if there |
| 509 // is an '|' alternation within the parens. | 509 // is an '|' alternation within the parens. |
| 510 { | 510 { |
| 511 fixLiterals(); | 511 fixLiterals(); |
| 512 appendOp(URX_BUILD(URX_NOP, 0)); | 512 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 513 int32_t varLoc = allocateData(1); // Reserve a data location for
saving the state stack ptr. | 513 int32_t varLoc = fRXPat->fDataSize; // Reserve a data locatio
n for saving the |
| 514 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); | 514 fRXPat->fDataSize += 1; // state stack ptr. |
| 515 appendOp(stoOp); | 515 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); |
| 516 appendOp(URX_BUILD(URX_NOP, 0)); | 516 fRXPat->fCompiledPat->addElement(stoOp, *fStatus); |
| 517 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 517 | 518 |
| 518 // On the Parentheses stack, start a new frame and add the positions | 519 // On the Parentheses stack, start a new frame and add the positions |
| 519 // of the two NOPs. Depending on what follows in the pattern, the | 520 // of the two NOPs. Depending on what follows in the pattern, the |
| 520 // NOPs may be changed to SAVE_STATE or JMP ops, with a target | 521 // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
| 521 // address of the end of the parenthesized group. | 522 // address of the end of the parenthesized group. |
| 522 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 523 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 523 fParenStack.push(atomic, *fStatus); // Fra
me type. | 524 fParenStack.push(atomic, *fStatus); // Fra
me type. |
| 524 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP | 525 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP |
| 525 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP | 526 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP |
| 526 } | 527 } |
| (...skipping 22 matching lines...) Expand all Loading... |
| 549 // 6. NOP reserved for use by quantifiers on the block
. | 550 // 6. NOP reserved for use by quantifiers on the block
. |
| 550 // Look-ahead can't have quantifiers, but paren
stack | 551 // Look-ahead can't have quantifiers, but paren
stack |
| 551 // compile time conventions require the slot
anyhow. | 552 // compile time conventions require the slot
anyhow. |
| 552 // 7. NOP may be replaced if there are '|' ops in t
he block. | 553 // 7. NOP may be replaced if there are '|' ops in t
he block. |
| 553 // 8. code for parenthesized stuff. | 554 // 8. code for parenthesized stuff. |
| 554 // 9. LA_END | 555 // 9. LA_END |
| 555 // | 556 // |
| 556 // Two data slots are reserved, for saving the stack ptr and the input
position. | 557 // Two data slots are reserved, for saving the stack ptr and the input
position. |
| 557 { | 558 { |
| 558 fixLiterals(); | 559 fixLiterals(); |
| 559 int32_t dataLoc = allocateData(2); | 560 int32_t dataLoc = fRXPat->fDataSize; |
| 561 fRXPat->fDataSize += 2; |
| 560 int32_t op = URX_BUILD(URX_LA_START, dataLoc); | 562 int32_t op = URX_BUILD(URX_LA_START, dataLoc); |
| 561 appendOp(op); | 563 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 562 | 564 |
| 563 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); | 565 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); |
| 564 appendOp(op); | 566 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 565 | 567 |
| 566 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); | 568 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); |
| 567 appendOp(op); | 569 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 568 | 570 |
| 569 op = URX_BUILD(URX_LA_END, dataLoc); | 571 op = URX_BUILD(URX_LA_END, dataLoc); |
| 570 appendOp(op); | 572 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 571 | 573 |
| 572 op = URX_BUILD(URX_BACKTRACK, 0); | 574 op = URX_BUILD(URX_BACKTRACK, 0); |
| 573 appendOp(op); | 575 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 574 | 576 |
| 575 op = URX_BUILD(URX_NOP, 0); | 577 op = URX_BUILD(URX_NOP, 0); |
| 576 appendOp(op); | 578 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 577 appendOp(op); | 579 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 578 | 580 |
| 579 // On the Parentheses stack, start a new frame and add the positions | 581 // On the Parentheses stack, start a new frame and add the positions |
| 580 // of the NOPs. | 582 // of the NOPs. |
| 581 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 583 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 582 fParenStack.push(lookAhead, *fStatus); // Fra
me type. | 584 fParenStack.push(lookAhead, *fStatus); // Fra
me type. |
| 583 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 585 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
| 584 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location | 586 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location |
| 585 } | 587 } |
| 586 break; | 588 break; |
| 587 | 589 |
| 588 case doOpenLookAheadNeg: | 590 case doOpenLookAheadNeg: |
| 589 // Negated Lookahead. (?! stuff ) | 591 // Negated Lookahead. (?! stuff ) |
| 590 // Compiles to | 592 // Compiles to |
| 591 // 1. START_LA dataloc | 593 // 1. START_LA dataloc |
| 592 // 2. SAVE_STATE 7 // Fail within look-ahead block restor
es to this state, | 594 // 2. SAVE_STATE 7 // Fail within look-ahead block restor
es to this state, |
| 593 // // which continues with the match. | 595 // // which continues with the match. |
| 594 // 3. NOP // Std. Open Paren sequence, for possi
ble '|' | 596 // 3. NOP // Std. Open Paren sequence, for possi
ble '|' |
| 595 // 4. code for parenthesized stuff. | 597 // 4. code for parenthesized stuff. |
| 596 // 5. END_LA // Cut back stack, remove saved state
from step 2. | 598 // 5. END_LA // Cut back stack, remove saved state
from step 2. |
| 597 // 6. BACKTRACK // code in block succeeded, so neg. lo
okahead fails. | 599 // 6. BACKTRACK // code in block succeeded, so neg. lo
okahead fails. |
| 598 // 7. END_LA // Restore match region, in case look-
ahead was using | 600 // 7. END_LA // Restore match region, in case look-
ahead was using |
| 599 // an alternate (transparent) reg
ion. | 601 // an alternate (transparent) reg
ion. |
| 600 { | 602 { |
| 601 fixLiterals(); | 603 fixLiterals(); |
| 602 int32_t dataLoc = allocateData(2); | 604 int32_t dataLoc = fRXPat->fDataSize; |
| 605 fRXPat->fDataSize += 2; |
| 603 int32_t op = URX_BUILD(URX_LA_START, dataLoc); | 606 int32_t op = URX_BUILD(URX_LA_START, dataLoc); |
| 604 appendOp(op); | 607 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 605 | 608 |
| 606 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patche
d later. | 609 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patche
d later. |
| 607 appendOp(op); | 610 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 608 | 611 |
| 609 op = URX_BUILD(URX_NOP, 0); | 612 op = URX_BUILD(URX_NOP, 0); |
| 610 appendOp(op); | 613 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 611 | 614 |
| 612 // On the Parentheses stack, start a new frame and add the positions | 615 // On the Parentheses stack, start a new frame and add the positions |
| 613 // of the StateSave and NOP. | 616 // of the StateSave and NOP. |
| 614 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 617 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 615 fParenStack.push(negLookAhead, *fStatus); // Fram
e type | 618 fParenStack.push(negLookAhead, *fStatus); // Fram
e type |
| 616 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
STATE_SAVE location | 619 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
STATE_SAVE location |
| 617 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location | 620 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location |
| 618 | 621 |
| 619 // Instructions #5 - #7 will be added when the ')' is encountered. | 622 // Instructions #5 - #7 will be added when the ')' is encountered. |
| 620 } | 623 } |
| (...skipping 17 matching lines...) Expand all Loading... |
| 638 // Allocate a block of matcher data, to contain (when runni
ng a match) | 641 // Allocate a block of matcher data, to contain (when runni
ng a match) |
| 639 // 0: Stack ptr on entry | 642 // 0: Stack ptr on entry |
| 640 // 1: Input Index on entry | 643 // 1: Input Index on entry |
| 641 // 2: Start index of match current match attempt. | 644 // 2: Start index of match current match attempt. |
| 642 // 3: Original Input String len. | 645 // 3: Original Input String len. |
| 643 | 646 |
| 644 // Generate match code for any pending literals. | 647 // Generate match code for any pending literals. |
| 645 fixLiterals(); | 648 fixLiterals(); |
| 646 | 649 |
| 647 // Allocate data space | 650 // Allocate data space |
| 648 int32_t dataLoc = allocateData(4); | 651 int32_t dataLoc = fRXPat->fDataSize; |
| 652 fRXPat->fDataSize += 4; |
| 649 | 653 |
| 650 // Emit URX_LB_START | 654 // Emit URX_LB_START |
| 651 int32_t op = URX_BUILD(URX_LB_START, dataLoc); | 655 int32_t op = URX_BUILD(URX_LB_START, dataLoc); |
| 652 appendOp(op); | 656 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 653 | 657 |
| 654 // Emit URX_LB_CONT | 658 // Emit URX_LB_CONT |
| 655 op = URX_BUILD(URX_LB_CONT, dataLoc); | 659 op = URX_BUILD(URX_LB_CONT, dataLoc); |
| 656 appendOp(op); | 660 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 657 appendOp(0); // MinMatchLength. To be filled later. | 661 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength
. To be filled later. |
| 658 appendOp(0); // MaxMatchLength. To be filled later. | 662 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength
. To be filled later. |
| 659 | 663 |
| 660 // Emit the NOP | 664 // Emit the NOP |
| 661 op = URX_BUILD(URX_NOP, 0); | 665 op = URX_BUILD(URX_NOP, 0); |
| 662 appendOp(op); | 666 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 663 appendOp(op); | 667 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 664 | 668 |
| 665 // On the Parentheses stack, start a new frame and add the positions | 669 // On the Parentheses stack, start a new frame and add the positions |
| 666 // of the URX_LB_CONT and the NOP. | 670 // of the URX_LB_CONT and the NOP. |
| 667 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 671 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 668 fParenStack.push(lookBehind, *fStatus); // Fra
me type | 672 fParenStack.push(lookBehind, *fStatus); // Fra
me type |
| 669 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 673 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
| 670 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location | 674 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location |
| 671 | 675 |
| 672 // The final two instructions will be added when the ')' is encounte
red. | 676 // The final two instructions will be added when the ')' is encounte
red. |
| 673 } | 677 } |
| (...skipping 19 matching lines...) Expand all Loading... |
| 693 // Allocate a block of matcher data, to contain (when runni
ng a match) | 697 // Allocate a block of matcher data, to contain (when runni
ng a match) |
| 694 // 0: Stack ptr on entry | 698 // 0: Stack ptr on entry |
| 695 // 1: Input Index on entry | 699 // 1: Input Index on entry |
| 696 // 2: Start index of match current match attempt. | 700 // 2: Start index of match current match attempt. |
| 697 // 3: Original Input String len. | 701 // 3: Original Input String len. |
| 698 | 702 |
| 699 // Generate match code for any pending literals. | 703 // Generate match code for any pending literals. |
| 700 fixLiterals(); | 704 fixLiterals(); |
| 701 | 705 |
| 702 // Allocate data space | 706 // Allocate data space |
| 703 int32_t dataLoc = allocateData(4); | 707 int32_t dataLoc = fRXPat->fDataSize; |
| 708 fRXPat->fDataSize += 4; |
| 704 | 709 |
| 705 // Emit URX_LB_START | 710 // Emit URX_LB_START |
| 706 int32_t op = URX_BUILD(URX_LB_START, dataLoc); | 711 int32_t op = URX_BUILD(URX_LB_START, dataLoc); |
| 707 appendOp(op); | 712 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 708 | 713 |
| 709 // Emit URX_LBN_CONT | 714 // Emit URX_LBN_CONT |
| 710 op = URX_BUILD(URX_LBN_CONT, dataLoc); | 715 op = URX_BUILD(URX_LBN_CONT, dataLoc); |
| 711 appendOp(op); | 716 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 712 appendOp(0); // MinMatchLength. To be filled later. | 717 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength
. To be filled later. |
| 713 appendOp(0); // MaxMatchLength. To be filled later. | 718 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength
. To be filled later. |
| 714 appendOp(0); // Continue Loc. To be filled later. | 719 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc.
To be filled later. |
| 715 | 720 |
| 716 // Emit the NOP | 721 // Emit the NOP |
| 717 op = URX_BUILD(URX_NOP, 0); | 722 op = URX_BUILD(URX_NOP, 0); |
| 718 appendOp(op); | 723 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 719 appendOp(op); | 724 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 720 | 725 |
| 721 // On the Parentheses stack, start a new frame and add the positions | 726 // On the Parentheses stack, start a new frame and add the positions |
| 722 // of the URX_LB_CONT and the NOP. | 727 // of the URX_LB_CONT and the NOP. |
| 723 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 728 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 724 fParenStack.push(lookBehindN, *fStatus); // Fra
me type | 729 fParenStack.push(lookBehindN, *fStatus); // Fra
me type |
| 725 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 730 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
| 726 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location | 731 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location |
| 727 | 732 |
| 728 // The final two instructions will be added when the ')' is encounte
red. | 733 // The final two instructions will be added when the ')' is encounte
red. |
| 729 } | 734 } |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 780 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 | 785 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 |
| 781 int32_t frameLoc; | 786 int32_t frameLoc; |
| 782 | 787 |
| 783 // Check for simple constructs, which may get special optimized code
. | 788 // Check for simple constructs, which may get special optimized code
. |
| 784 if (topLoc == fRXPat->fCompiledPat->size() - 1) { | 789 if (topLoc == fRXPat->fCompiledPat->size() - 1) { |
| 785 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); | 790 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); |
| 786 | 791 |
| 787 if (URX_TYPE(repeatedOp) == URX_SETREF) { | 792 if (URX_TYPE(repeatedOp) == URX_SETREF) { |
| 788 // Emit optimized code for [char set]+ | 793 // Emit optimized code for [char set]+ |
| 789 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); | 794 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); |
| 790 appendOp(loopOpI); | 795 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); |
| 791 frameLoc = allocateStackData(1); | 796 frameLoc = fRXPat->fFrameSize; |
| 797 fRXPat->fFrameSize++; |
| 792 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | 798 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); |
| 793 appendOp(loopOpC); | 799 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
| 794 break; | 800 break; |
| 795 } | 801 } |
| 796 | 802 |
| 797 if (URX_TYPE(repeatedOp) == URX_DOTANY || | 803 if (URX_TYPE(repeatedOp) == URX_DOTANY || |
| 798 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || | 804 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || |
| 799 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { | 805 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { |
| 800 // Emit Optimized code for .+ operations. | 806 // Emit Optimized code for .+ operations. |
| 801 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | 807 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); |
| 802 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | 808 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { |
| 803 // URX_LOOP_DOT_I operand is a flag indicating ". matche
s any" mode. | 809 // URX_LOOP_DOT_I operand is a flag indicating ". matche
s any" mode. |
| 804 loopOpI |= 1; | 810 loopOpI |= 1; |
| 805 } | 811 } |
| 806 if (fModeFlags & UREGEX_UNIX_LINES) { | 812 if (fModeFlags & UREGEX_UNIX_LINES) { |
| 807 loopOpI |= 2; | 813 loopOpI |= 2; |
| 808 } | 814 } |
| 809 appendOp(loopOpI); | 815 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); |
| 810 frameLoc = allocateStackData(1); | 816 frameLoc = fRXPat->fFrameSize; |
| 817 fRXPat->fFrameSize++; |
| 811 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | 818 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); |
| 812 appendOp(loopOpC); | 819 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
| 813 break; | 820 break; |
| 814 } | 821 } |
| 815 | 822 |
| 816 } | 823 } |
| 817 | 824 |
| 818 // General case. | 825 // General case. |
| 819 | 826 |
| 820 // Check for minimum match length of zero, which requires | 827 // Check for minimum match length of zero, which requires |
| 821 // extra loop-breaking code. | 828 // extra loop-breaking code. |
| 822 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { | 829 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { |
| 823 // Zero length match is possible. | 830 // Zero length match is possible. |
| 824 // Emit the code sequence that can handle it. | 831 // Emit the code sequence that can handle it. |
| 825 insertOp(topLoc); | 832 insertOp(topLoc); |
| 826 frameLoc = allocateStackData(1); | 833 frameLoc = fRXPat->fFrameSize; |
| 834 fRXPat->fFrameSize++; |
| 827 | 835 |
| 828 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); | 836 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); |
| 829 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 837 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 830 | 838 |
| 831 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); | 839 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); |
| 832 appendOp(op); | 840 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 833 } else { | 841 } else { |
| 834 // Simpler code when the repeated body must match something non-
empty | 842 // Simpler code when the repeated body must match something non-
empty |
| 835 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); | 843 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); |
| 836 appendOp(jmpOp); | 844 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); |
| 837 } | 845 } |
| 838 } | 846 } |
| 839 break; | 847 break; |
| 840 | 848 |
| 841 case doNGPlus: | 849 case doNGPlus: |
| 842 // Non-greedy '+?' compiles to | 850 // Non-greedy '+?' compiles to |
| 843 // 1. stuff to be repeated (already built) | 851 // 1. stuff to be repeated (already built) |
| 844 // 2. state-save 1 | 852 // 2. state-save 1 |
| 845 // 3. ... | 853 // 3. ... |
| 846 { | 854 { |
| 847 int32_t topLoc = blockTopLoc(FALSE); | 855 int32_t topLoc = blockTopLoc(FALSE); |
| 848 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); | 856 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); |
| 849 appendOp(saveStateOp); | 857 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); |
| 850 } | 858 } |
| 851 break; | 859 break; |
| 852 | 860 |
| 853 | 861 |
| 854 case doOpt: | 862 case doOpt: |
| 855 // Normal (greedy) ? quantifier. | 863 // Normal (greedy) ? quantifier. |
| 856 // Compiles to | 864 // Compiles to |
| 857 // 1. state save 3 | 865 // 1. state save 3 |
| 858 // 2. body of optional block | 866 // 2. body of optional block |
| 859 // 3. ... | 867 // 3. ... |
| (...skipping 16 matching lines...) Expand all Loading... |
| 876 // This code is less than ideal, with two jmps instead of one, because
we can only | 884 // This code is less than ideal, with two jmps instead of one, because
we can only |
| 877 // insert one instruction at the top of the block being iterated. | 885 // insert one instruction at the top of the block being iterated. |
| 878 { | 886 { |
| 879 int32_t jmp1_loc = blockTopLoc(TRUE); | 887 int32_t jmp1_loc = blockTopLoc(TRUE); |
| 880 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); | 888 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); |
| 881 | 889 |
| 882 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); | 890 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); |
| 883 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); | 891 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); |
| 884 | 892 |
| 885 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); | 893 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); |
| 886 appendOp(jmp2_op); | 894 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); |
| 887 | 895 |
| 888 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); | 896 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); |
| 889 appendOp(save_op); | 897 fRXPat->fCompiledPat->addElement(save_op, *fStatus); |
| 890 } | 898 } |
| 891 break; | 899 break; |
| 892 | 900 |
| 893 | 901 |
| 894 case doStar: | 902 case doStar: |
| 895 // Normal (greedy) * quantifier. | 903 // Normal (greedy) * quantifier. |
| 896 // Compiles to | 904 // Compiles to |
| 897 // 1. STATE_SAVE 4 | 905 // 1. STATE_SAVE 4 |
| 898 // 2. body of stuff being iterated over | 906 // 2. body of stuff being iterated over |
| 899 // 3. JMP_SAV 2 | 907 // 3. JMP_SAV 2 |
| (...skipping 21 matching lines...) Expand all Loading... |
| 921 | 929 |
| 922 // Check for simple *, where the construct being repeated | 930 // Check for simple *, where the construct being repeated |
| 923 // compiled to single opcode, and might be optimizable. | 931 // compiled to single opcode, and might be optimizable. |
| 924 if (topLoc == fRXPat->fCompiledPat->size() - 1) { | 932 if (topLoc == fRXPat->fCompiledPat->size() - 1) { |
| 925 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); | 933 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); |
| 926 | 934 |
| 927 if (URX_TYPE(repeatedOp) == URX_SETREF) { | 935 if (URX_TYPE(repeatedOp) == URX_SETREF) { |
| 928 // Emit optimized code for a [char set]* | 936 // Emit optimized code for a [char set]* |
| 929 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); | 937 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); |
| 930 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | 938 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); |
| 931 dataLoc = allocateStackData(1); | 939 dataLoc = fRXPat->fFrameSize; |
| 940 fRXPat->fFrameSize++; |
| 932 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | 941 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); |
| 933 appendOp(loopOpC); | 942 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
| 934 break; | 943 break; |
| 935 } | 944 } |
| 936 | 945 |
| 937 if (URX_TYPE(repeatedOp) == URX_DOTANY || | 946 if (URX_TYPE(repeatedOp) == URX_DOTANY || |
| 938 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || | 947 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || |
| 939 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { | 948 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { |
| 940 // Emit Optimized code for .* operations. | 949 // Emit Optimized code for .* operations. |
| 941 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | 950 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); |
| 942 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | 951 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { |
| 943 // URX_LOOP_DOT_I operand is a flag indicating . matches
any mode. | 952 // URX_LOOP_DOT_I operand is a flag indicating . matches
any mode. |
| 944 loopOpI |= 1; | 953 loopOpI |= 1; |
| 945 } | 954 } |
| 946 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { | 955 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { |
| 947 loopOpI |= 2; | 956 loopOpI |= 2; |
| 948 } | 957 } |
| 949 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | 958 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); |
| 950 dataLoc = allocateStackData(1); | 959 dataLoc = fRXPat->fFrameSize; |
| 960 fRXPat->fFrameSize++; |
| 951 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | 961 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); |
| 952 appendOp(loopOpC); | 962 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
| 953 break; | 963 break; |
| 954 } | 964 } |
| 955 } | 965 } |
| 956 | 966 |
| 957 // Emit general case code for this * | 967 // Emit general case code for this * |
| 958 // The optimizations did not apply. | 968 // The optimizations did not apply. |
| 959 | 969 |
| 960 int32_t saveStateLoc = blockTopLoc(TRUE); | 970 int32_t saveStateLoc = blockTopLoc(TRUE); |
| 961 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); | 971 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); |
| 962 | 972 |
| 963 // Check for minimum match length of zero, which requires | 973 // Check for minimum match length of zero, which requires |
| 964 // extra loop-breaking code. | 974 // extra loop-breaking code. |
| 965 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) ==
0) { | 975 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) ==
0) { |
| 966 insertOp(saveStateLoc); | 976 insertOp(saveStateLoc); |
| 967 dataLoc = allocateStackData(1); | 977 dataLoc = fRXPat->fFrameSize; |
| 978 fRXPat->fFrameSize++; |
| 968 | 979 |
| 969 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); | 980 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); |
| 970 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); | 981 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); |
| 971 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); | 982 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); |
| 972 } | 983 } |
| 973 | 984 |
| 974 // Locate the position in the compiled pattern where the match will
continue | 985 // Locate the position in the compiled pattern where the match will
continue |
| 975 // after completing the *. (4 or 5 in the comment above) | 986 // after completing the *. (4 or 5 in the comment above) |
| 976 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | 987 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
| 977 | 988 |
| 978 // Put together the save state op and store it into the compiled code. | 989 // Put together the save state op and store it into the compiled code. |
| 979 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); | 990 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); |
| 980 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); | 991 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); |
| 981 | 992 |
| 982 // Append the URX_JMP_SAV or URX_JMP_SAV_X operation to the compiled patt
ern. | 993 // Append the URX_JMP_SAV or URX_JMP_SAV_X operation to the compiled patt
ern. |
| 983 appendOp(jmpOp); | 994 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); |
| 984 } | 995 } |
| 985 break; | 996 break; |
| 986 | 997 |
| 987 case doNGStar: | 998 case doNGStar: |
| 988 // Non-greedy *? quantifier | 999 // Non-greedy *? quantifier |
| 989 // compiles to | 1000 // compiles to |
| 990 // 1. JMP 3 | 1001 // 1. JMP 3 |
| 991 // 2. body of stuff being iterated over | 1002 // 2. body of stuff being iterated over |
| 992 // 3. STATE_SAVE 2 | 1003 // 3. STATE_SAVE 2 |
| 993 // 4 ... | 1004 // 4 ... |
| 994 { | 1005 { |
| 995 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1
. | 1006 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1
. |
| 996 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3
. | 1007 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3
. |
| 997 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); | 1008 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); |
| 998 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); | 1009 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); |
| 999 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); | 1010 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); |
| 1000 appendOp(stateSaveOp); | 1011 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); |
| 1001 } | 1012 } |
| 1002 break; | 1013 break; |
| 1003 | 1014 |
| 1004 | 1015 |
| 1005 case doIntervalInit: | 1016 case doIntervalInit: |
| 1006 // The '{' opening an interval quantifier was just scanned. | 1017 // The '{' opening an interval quantifier was just scanned. |
| 1007 // Init the counter variables that will accumulate the values as the di
gits | 1018 // Init the counter variables that will accumulate the values as the di
gits |
| 1008 // are scanned. | 1019 // are scanned. |
| 1009 fIntervalLow = 0; | 1020 fIntervalLow = 0; |
| 1010 fIntervalUpper = -1; | 1021 fIntervalUpper = -1; |
| (...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1059 int32_t topLoc = blockTopLoc(FALSE); | 1070 int32_t topLoc = blockTopLoc(FALSE); |
| 1060 | 1071 |
| 1061 // Produce normal looping code. | 1072 // Produce normal looping code. |
| 1062 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); | 1073 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); |
| 1063 | 1074 |
| 1064 // Surround the just-emitted normal looping code with a STO_SP ... L
D_SP | 1075 // Surround the just-emitted normal looping code with a STO_SP ... L
D_SP |
| 1065 // just as if the loop was enclosed in atomic parentheses. | 1076 // just as if the loop was enclosed in atomic parentheses. |
| 1066 | 1077 |
| 1067 // First the STO_SP before the start of the loop | 1078 // First the STO_SP before the start of the loop |
| 1068 insertOp(topLoc); | 1079 insertOp(topLoc); |
| 1069 | 1080 int32_t varLoc = fRXPat->fDataSize; // Reserve a data locatio
n for saving the |
| 1070 int32_t varLoc = allocateData(1); // Reserve a data location for
saving the | 1081 fRXPat->fDataSize += 1; // state stack ptr. |
| 1071 int32_t op = URX_BUILD(URX_STO_SP, varLoc); | 1082 int32_t op = URX_BUILD(URX_STO_SP, varLoc); |
| 1072 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1083 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 1073 | 1084 |
| 1074 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); | 1085 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); |
| 1075 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topL
oc); | 1086 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topL
oc); |
| 1076 loopOp++; // point LoopOp after the just-inserted STO_SP | 1087 loopOp++; // point LoopOp after the just-inserted STO_SP |
| 1077 fRXPat->fCompiledPat->push(loopOp, *fStatus); | 1088 fRXPat->fCompiledPat->push(loopOp, *fStatus); |
| 1078 | 1089 |
| 1079 // Then the LD_SP after the end of the loop | 1090 // Then the LD_SP after the end of the loop |
| 1080 op = URX_BUILD(URX_LD_SP, varLoc); | 1091 op = URX_BUILD(URX_LD_SP, varLoc); |
| 1081 appendOp(op); | 1092 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1082 } | 1093 } |
| 1083 | 1094 |
| 1084 break; | 1095 break; |
| 1085 | 1096 |
| 1086 case doNGInterval: | 1097 case doNGInterval: |
| 1087 // Finished scanning a non-greedy {lower,upper}? interval. Generate the
code for it. | 1098 // Finished scanning a non-greedy {lower,upper}? interval. Generate the
code for it. |
| 1088 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); | 1099 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); |
| 1089 break; | 1100 break; |
| 1090 | 1101 |
| 1091 case doIntervalError: | 1102 case doIntervalError: |
| (...skipping 23 matching lines...) Expand all Loading... |
| 1115 { | 1126 { |
| 1116 fixLiterals(FALSE); | 1127 fixLiterals(FALSE); |
| 1117 int32_t op; | 1128 int32_t op; |
| 1118 if (fModeFlags & UREGEX_DOTALL) { | 1129 if (fModeFlags & UREGEX_DOTALL) { |
| 1119 op = URX_BUILD(URX_DOTANY_ALL, 0); | 1130 op = URX_BUILD(URX_DOTANY_ALL, 0); |
| 1120 } else if (fModeFlags & UREGEX_UNIX_LINES) { | 1131 } else if (fModeFlags & UREGEX_UNIX_LINES) { |
| 1121 op = URX_BUILD(URX_DOTANY_UNIX, 0); | 1132 op = URX_BUILD(URX_DOTANY_UNIX, 0); |
| 1122 } else { | 1133 } else { |
| 1123 op = URX_BUILD(URX_DOTANY, 0); | 1134 op = URX_BUILD(URX_DOTANY, 0); |
| 1124 } | 1135 } |
| 1125 appendOp(op); | 1136 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1126 } | 1137 } |
| 1127 break; | 1138 break; |
| 1128 | 1139 |
| 1129 case doCaret: | 1140 case doCaret: |
| 1130 { | 1141 { |
| 1131 fixLiterals(FALSE); | 1142 fixLiterals(FALSE); |
| 1132 int32_t op = 0; | 1143 int32_t op = 0; |
| 1133 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1144 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
| 1134 op = URX_CARET; | 1145 op = URX_CARET; |
| 1135 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1146 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
| 1136 op = URX_CARET_M; | 1147 op = URX_CARET_M; |
| 1137 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1148 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
| 1138 op = URX_CARET; // Only testing true start of input. | 1149 op = URX_CARET; // Only testing true start of input. |
| 1139 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1150 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
| 1140 op = URX_CARET_M_UNIX; | 1151 op = URX_CARET_M_UNIX; |
| 1141 } | 1152 } |
| 1142 appendOp(URX_BUILD(op, 0)); | 1153 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); |
| 1143 } | 1154 } |
| 1144 break; | 1155 break; |
| 1145 | 1156 |
| 1146 case doDollar: | 1157 case doDollar: |
| 1147 { | 1158 { |
| 1148 fixLiterals(FALSE); | 1159 fixLiterals(FALSE); |
| 1149 int32_t op = 0; | 1160 int32_t op = 0; |
| 1150 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1161 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
| 1151 op = URX_DOLLAR; | 1162 op = URX_DOLLAR; |
| 1152 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1163 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
| 1153 op = URX_DOLLAR_M; | 1164 op = URX_DOLLAR_M; |
| 1154 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1165 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
| 1155 op = URX_DOLLAR_D; | 1166 op = URX_DOLLAR_D; |
| 1156 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1167 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
| 1157 op = URX_DOLLAR_MD; | 1168 op = URX_DOLLAR_MD; |
| 1158 } | 1169 } |
| 1159 appendOp(URX_BUILD(op, 0)); | 1170 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); |
| 1160 } | 1171 } |
| 1161 break; | 1172 break; |
| 1162 | 1173 |
| 1163 case doBackslashA: | 1174 case doBackslashA: |
| 1164 fixLiterals(FALSE); | 1175 fixLiterals(FALSE); |
| 1165 appendOp(URX_BUILD(URX_CARET, 0)); | 1176 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); |
| 1166 break; | 1177 break; |
| 1167 | 1178 |
| 1168 case doBackslashB: | 1179 case doBackslashB: |
| 1169 { | 1180 { |
| 1170 #if UCONFIG_NO_BREAK_ITERATION==1 | 1181 #if UCONFIG_NO_BREAK_ITERATION==1 |
| 1171 if (fModeFlags & UREGEX_UWORD) { | 1182 if (fModeFlags & UREGEX_UWORD) { |
| 1172 error(U_UNSUPPORTED_ERROR); | 1183 error(U_UNSUPPORTED_ERROR); |
| 1173 } | 1184 } |
| 1174 #endif | 1185 #endif |
| 1175 fixLiterals(FALSE); | 1186 fixLiterals(FALSE); |
| 1176 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; | 1187 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; |
| 1177 appendOp(URX_BUILD(op, 1)); | 1188 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); |
| 1178 } | 1189 } |
| 1179 break; | 1190 break; |
| 1180 | 1191 |
| 1181 case doBackslashb: | 1192 case doBackslashb: |
| 1182 { | 1193 { |
| 1183 #if UCONFIG_NO_BREAK_ITERATION==1 | 1194 #if UCONFIG_NO_BREAK_ITERATION==1 |
| 1184 if (fModeFlags & UREGEX_UWORD) { | 1195 if (fModeFlags & UREGEX_UWORD) { |
| 1185 error(U_UNSUPPORTED_ERROR); | 1196 error(U_UNSUPPORTED_ERROR); |
| 1186 } | 1197 } |
| 1187 #endif | 1198 #endif |
| 1188 fixLiterals(FALSE); | 1199 fixLiterals(FALSE); |
| 1189 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; | 1200 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; |
| 1190 appendOp(URX_BUILD(op, 0)); | 1201 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); |
| 1191 } | 1202 } |
| 1192 break; | 1203 break; |
| 1193 | 1204 |
| 1194 case doBackslashD: | 1205 case doBackslashD: |
| 1195 fixLiterals(FALSE); | 1206 fixLiterals(FALSE); |
| 1196 appendOp(URX_BUILD(URX_BACKSLASH_D, 1)); | 1207 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus
); |
| 1197 break; | 1208 break; |
| 1198 | 1209 |
| 1199 case doBackslashd: | 1210 case doBackslashd: |
| 1200 fixLiterals(FALSE); | 1211 fixLiterals(FALSE); |
| 1201 appendOp(URX_BUILD(URX_BACKSLASH_D, 0)); | 1212 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus
); |
| 1202 break; | 1213 break; |
| 1203 | 1214 |
| 1204 case doBackslashG: | 1215 case doBackslashG: |
| 1205 fixLiterals(FALSE); | 1216 fixLiterals(FALSE); |
| 1206 appendOp(URX_BUILD(URX_BACKSLASH_G, 0)); | 1217 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus
); |
| 1207 break; | 1218 break; |
| 1208 | 1219 |
| 1209 case doBackslashS: | 1220 case doBackslashS: |
| 1210 fixLiterals(FALSE); | 1221 fixLiterals(FALSE); |
| 1211 appendOp(URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET)); | 1222 fRXPat->fCompiledPat->addElement( |
| 1223 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); |
| 1212 break; | 1224 break; |
| 1213 | 1225 |
| 1214 case doBackslashs: | 1226 case doBackslashs: |
| 1215 fixLiterals(FALSE); | 1227 fixLiterals(FALSE); |
| 1216 appendOp(URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET)); | 1228 fRXPat->fCompiledPat->addElement( |
| 1229 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); |
| 1217 break; | 1230 break; |
| 1218 | 1231 |
| 1219 case doBackslashW: | 1232 case doBackslashW: |
| 1220 fixLiterals(FALSE); | 1233 fixLiterals(FALSE); |
| 1221 appendOp(URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET)); | 1234 fRXPat->fCompiledPat->addElement( |
| 1235 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); |
| 1222 break; | 1236 break; |
| 1223 | 1237 |
| 1224 case doBackslashw: | 1238 case doBackslashw: |
| 1225 fixLiterals(FALSE); | 1239 fixLiterals(FALSE); |
| 1226 appendOp(URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET)); | 1240 fRXPat->fCompiledPat->addElement( |
| 1241 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); |
| 1227 break; | 1242 break; |
| 1228 | 1243 |
| 1229 case doBackslashX: | 1244 case doBackslashX: |
| 1230 fixLiterals(FALSE); | 1245 fixLiterals(FALSE); |
| 1231 appendOp(URX_BUILD(URX_BACKSLASH_X, 0)); | 1246 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus
); |
| 1232 break; | 1247 break; |
| 1233 | 1248 |
| 1234 | 1249 |
| 1235 case doBackslashZ: | 1250 case doBackslashZ: |
| 1236 fixLiterals(FALSE); | 1251 fixLiterals(FALSE); |
| 1237 appendOp(URX_BUILD(URX_DOLLAR, 0)); | 1252 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); |
| 1238 break; | 1253 break; |
| 1239 | 1254 |
| 1240 case doBackslashz: | 1255 case doBackslashz: |
| 1241 fixLiterals(FALSE); | 1256 fixLiterals(FALSE); |
| 1242 appendOp(URX_BUILD(URX_BACKSLASH_Z, 0)); | 1257 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus
); |
| 1243 break; | 1258 break; |
| 1244 | 1259 |
| 1245 case doEscapeError: | 1260 case doEscapeError: |
| 1246 error(U_REGEX_BAD_ESCAPE_SEQUENCE); | 1261 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
| 1247 break; | 1262 break; |
| 1248 | 1263 |
| 1249 case doExit: | 1264 case doExit: |
| 1250 fixLiterals(FALSE); | 1265 fixLiterals(FALSE); |
| 1251 returnVal = FALSE; | 1266 returnVal = FALSE; |
| 1252 break; | 1267 break; |
| 1253 | 1268 |
| 1254 case doProperty: | 1269 case doProperty: |
| 1255 { | 1270 { |
| 1256 fixLiterals(FALSE); | 1271 fixLiterals(FALSE); |
| 1257 UnicodeSet *theSet = scanProp(); | 1272 UnicodeSet *theSet = scanProp(); |
| 1258 compileSet(theSet); | 1273 compileSet(theSet); |
| 1259 } | 1274 } |
| 1260 break; | 1275 break; |
| 1261 | 1276 |
| 1262 case doNamedChar: | 1277 case doNamedChar: |
| 1263 { | 1278 { |
| 1264 UChar32 c = scanNamedChar(); | 1279 UChar32 c = scanNamedChar(); |
| 1265 literalChar(c); | 1280 literalChar(c); |
| 1266 } | 1281 } |
| 1267 break; | 1282 break; |
| 1268 | 1283 |
| 1269 | 1284 |
| 1270 case doBackRef: | 1285 case doBackRef: |
| 1271 // BackReference. Somewhat unusual in that the front-end can not comple
tely parse | 1286 // BackReference. Somewhat unusual in that the front-end can not comple
tely parse |
| 1272 // the regular expression, because the number of digits
to be consumed | 1287 // the regular expression, because the number of digits
to be consumed |
| 1273 // depends on the number of capture groups that have bee
n defined. So | 1288 // depends on the number of capture groups that have bee
n defined. So |
| 1274 // we have to do it here instead. | 1289 // we have to do it here instead. |
| 1275 { | 1290 { |
| 1276 int32_t numCaptureGroups = fRXPat->fGroupMap->size(); | 1291 int32_t numCaptureGroups = fRXPat->fGroupMap->size(); |
| 1277 int32_t groupNum = 0; | 1292 int32_t groupNum = 0; |
| 1278 UChar32 c = fC.fChar; | 1293 UChar32 c = fC.fChar; |
| (...skipping 19 matching lines...) Expand all Loading... |
| 1298 // of compilation, it will be changed to the variable's location. | 1313 // of compilation, it will be changed to the variable's location. |
| 1299 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal
escape sequence, | 1314 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal
escape sequence, |
| 1300 // and shouldn't enter this code path at
all. | 1315 // and shouldn't enter this code path at
all. |
| 1301 fixLiterals(FALSE); | 1316 fixLiterals(FALSE); |
| 1302 int32_t op; | 1317 int32_t op; |
| 1303 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1318 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 1304 op = URX_BUILD(URX_BACKREF_I, groupNum); | 1319 op = URX_BUILD(URX_BACKREF_I, groupNum); |
| 1305 } else { | 1320 } else { |
| 1306 op = URX_BUILD(URX_BACKREF, groupNum); | 1321 op = URX_BUILD(URX_BACKREF, groupNum); |
| 1307 } | 1322 } |
| 1308 appendOp(op); | 1323 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1309 } | 1324 } |
| 1310 break; | 1325 break; |
| 1311 | 1326 |
| 1312 | 1327 |
| 1313 case doPossessivePlus: | 1328 case doPossessivePlus: |
| 1314 // Possessive ++ quantifier. | 1329 // Possessive ++ quantifier. |
| 1315 // Compiles to | 1330 // Compiles to |
| 1316 // 1. STO_SP | 1331 // 1. STO_SP |
| 1317 // 2. body of stuff being iterated over | 1332 // 2. body of stuff being iterated over |
| 1318 // 3. STATE_SAVE 5 | 1333 // 3. STATE_SAVE 5 |
| 1319 // 4. JMP 2 | 1334 // 4. JMP 2 |
| 1320 // 5. LD_SP | 1335 // 5. LD_SP |
| 1321 // 6. ... | 1336 // 6. ... |
| 1322 // | 1337 // |
| 1323 // Note: TODO: This is pretty inefficient. A mass of saved state is
built up | 1338 // Note: TODO: This is pretty inefficient. A mass of saved state is
built up |
| 1324 // then unconditionally discarded. Perhaps introduce a n
ew opcode. Ticket 6056 | 1339 // then unconditionally discarded. Perhaps introduce a n
ew opcode. Ticket 6056 |
| 1325 // | 1340 // |
| 1326 { | 1341 { |
| 1327 // Emit the STO_SP | 1342 // Emit the STO_SP |
| 1328 int32_t topLoc = blockTopLoc(TRUE); | 1343 int32_t topLoc = blockTopLoc(TRUE); |
| 1329 int32_t stoLoc = allocateData(1); // Reserve the data location fo
r storing save stack ptr. | 1344 int32_t stoLoc = fRXPat->fDataSize; |
| 1345 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. |
| 1330 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | 1346 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); |
| 1331 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1347 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 1332 | 1348 |
| 1333 // Emit the STATE_SAVE | 1349 // Emit the STATE_SAVE |
| 1334 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); | 1350 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); |
| 1335 appendOp(op); | 1351 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1336 | 1352 |
| 1337 // Emit the JMP | 1353 // Emit the JMP |
| 1338 op = URX_BUILD(URX_JMP, topLoc+1); | 1354 op = URX_BUILD(URX_JMP, topLoc+1); |
| 1339 appendOp(op); | 1355 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1340 | 1356 |
| 1341 // Emit the LD_SP | 1357 // Emit the LD_SP |
| 1342 op = URX_BUILD(URX_LD_SP, stoLoc); | 1358 op = URX_BUILD(URX_LD_SP, stoLoc); |
| 1343 appendOp(op); | 1359 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1344 } | 1360 } |
| 1345 break; | 1361 break; |
| 1346 | 1362 |
| 1347 case doPossessiveStar: | 1363 case doPossessiveStar: |
| 1348 // Possessive *+ quantifier. | 1364 // Possessive *+ quantifier. |
| 1349 // Compiles to | 1365 // Compiles to |
| 1350 // 1. STO_SP loc | 1366 // 1. STO_SP loc |
| 1351 // 2. STATE_SAVE 5 | 1367 // 2. STATE_SAVE 5 |
| 1352 // 3. body of stuff being iterated over | 1368 // 3. body of stuff being iterated over |
| 1353 // 4. JMP 2 | 1369 // 4. JMP 2 |
| 1354 // 5. LD_SP loc | 1370 // 5. LD_SP loc |
| 1355 // 6 ... | 1371 // 6 ... |
| 1356 // TODO: do something to cut back the state stack each time through the
loop. | 1372 // TODO: do something to cut back the state stack each time through the
loop. |
| 1357 { | 1373 { |
| 1358 // Reserve two slots at the top of the block. | 1374 // Reserve two slots at the top of the block. |
| 1359 int32_t topLoc = blockTopLoc(TRUE); | 1375 int32_t topLoc = blockTopLoc(TRUE); |
| 1360 insertOp(topLoc); | 1376 insertOp(topLoc); |
| 1361 | 1377 |
| 1362 // emit STO_SP loc | 1378 // emit STO_SP loc |
| 1363 int32_t stoLoc = allocateData(1); // Reserve the data location
for storing save stack ptr. | 1379 int32_t stoLoc = fRXPat->fDataSize; |
| 1380 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. |
| 1364 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | 1381 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); |
| 1365 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1382 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 1366 | 1383 |
| 1367 // Emit the SAVE_STATE 5 | 1384 // Emit the SAVE_STATE 5 |
| 1368 int32_t L7 = fRXPat->fCompiledPat->size()+1; | 1385 int32_t L7 = fRXPat->fCompiledPat->size()+1; |
| 1369 op = URX_BUILD(URX_STATE_SAVE, L7); | 1386 op = URX_BUILD(URX_STATE_SAVE, L7); |
| 1370 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | 1387 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); |
| 1371 | 1388 |
| 1372 // Append the JMP operation. | 1389 // Append the JMP operation. |
| 1373 op = URX_BUILD(URX_JMP, topLoc+1); | 1390 op = URX_BUILD(URX_JMP, topLoc+1); |
| 1374 appendOp(op); | 1391 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1375 | 1392 |
| 1376 // Emit the LD_SP loc | 1393 // Emit the LD_SP loc |
| 1377 op = URX_BUILD(URX_LD_SP, stoLoc); | 1394 op = URX_BUILD(URX_LD_SP, stoLoc); |
| 1378 appendOp(op); | 1395 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1379 } | 1396 } |
| 1380 break; | 1397 break; |
| 1381 | 1398 |
| 1382 case doPossessiveOpt: | 1399 case doPossessiveOpt: |
| 1383 // Possessive ?+ quantifier. | 1400 // Possessive ?+ quantifier. |
| 1384 // Compiles to | 1401 // Compiles to |
| 1385 // 1. STO_SP loc | 1402 // 1. STO_SP loc |
| 1386 // 2. SAVE_STATE 5 | 1403 // 2. SAVE_STATE 5 |
| 1387 // 3. body of optional block | 1404 // 3. body of optional block |
| 1388 // 4. LD_SP loc | 1405 // 4. LD_SP loc |
| 1389 // 5. ... | 1406 // 5. ... |
| 1390 // | 1407 // |
| 1391 { | 1408 { |
| 1392 // Reserve two slots at the top of the block. | 1409 // Reserve two slots at the top of the block. |
| 1393 int32_t topLoc = blockTopLoc(TRUE); | 1410 int32_t topLoc = blockTopLoc(TRUE); |
| 1394 insertOp(topLoc); | 1411 insertOp(topLoc); |
| 1395 | 1412 |
| 1396 // Emit the STO_SP | 1413 // Emit the STO_SP |
| 1397 int32_t stoLoc = allocateData(1); // Reserve the data location f
or storing save stack ptr. | 1414 int32_t stoLoc = fRXPat->fDataSize; |
| 1415 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. |
| 1398 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | 1416 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); |
| 1399 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1417 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 1400 | 1418 |
| 1401 // Emit the SAVE_STATE | 1419 // Emit the SAVE_STATE |
| 1402 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | 1420 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
| 1403 op = URX_BUILD(URX_STATE_SAVE, continueLoc); | 1421 op = URX_BUILD(URX_STATE_SAVE, continueLoc); |
| 1404 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | 1422 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); |
| 1405 | 1423 |
| 1406 // Emit the LD_SP | 1424 // Emit the LD_SP |
| 1407 op = URX_BUILD(URX_LD_SP, stoLoc); | 1425 op = URX_BUILD(URX_LD_SP, stoLoc); |
| 1408 appendOp(op); | 1426 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1409 } | 1427 } |
| 1410 break; | 1428 break; |
| 1411 | 1429 |
| 1412 | 1430 |
| 1413 case doBeginMatchMode: | 1431 case doBeginMatchMode: |
| 1414 fNewModeFlags = fModeFlags; | 1432 fNewModeFlags = fModeFlags; |
| 1415 fSetModeFlag = TRUE; | 1433 fSetModeFlag = TRUE; |
| 1416 break; | 1434 break; |
| 1417 | 1435 |
| 1418 case doMatchMode: // (?i) and similar | 1436 case doMatchMode: // (?i) and similar |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1455 // We've got a (?i: or similar. Begin a parenthesized block, save old | 1473 // We've got a (?i: or similar. Begin a parenthesized block, save old |
| 1456 // mode flags so they can be restored at the close of the block. | 1474 // mode flags so they can be restored at the close of the block. |
| 1457 // | 1475 // |
| 1458 // Compile to a | 1476 // Compile to a |
| 1459 // - NOP, which later may be replaced by a save-state if the | 1477 // - NOP, which later may be replaced by a save-state if the |
| 1460 // parenthesized group gets a * quantifier, followed by | 1478 // parenthesized group gets a * quantifier, followed by |
| 1461 // - NOP, which may later be replaced by a save-state if there | 1479 // - NOP, which may later be replaced by a save-state if there |
| 1462 // is an '|' alternation within the parens. | 1480 // is an '|' alternation within the parens. |
| 1463 { | 1481 { |
| 1464 fixLiterals(FALSE); | 1482 fixLiterals(FALSE); |
| 1465 appendOp(URX_BUILD(URX_NOP, 0)); | 1483 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 1466 appendOp(URX_BUILD(URX_NOP, 0)); | 1484 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 1467 | 1485 |
| 1468 // On the Parentheses stack, start a new frame and add the postions | 1486 // On the Parentheses stack, start a new frame and add the postions |
| 1469 // of the two NOPs (a normal non-capturing () frame, except for th
e | 1487 // of the two NOPs (a normal non-capturing () frame, except for th
e |
| 1470 // saving of the orignal mode flags.) | 1488 // saving of the orignal mode flags.) |
| 1471 fParenStack.push(fModeFlags, *fStatus); | 1489 fParenStack.push(fModeFlags, *fStatus); |
| 1472 fParenStack.push(flags, *fStatus); // Fra
me Marker | 1490 fParenStack.push(flags, *fStatus); // Fra
me Marker |
| 1473 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP | 1491 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP |
| 1474 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP | 1492 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP |
| 1475 | 1493 |
| 1476 // Set the current mode flags to the new values. | 1494 // Set the current mode flags to the new values. |
| (...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1617 // Finished a complete set expression, including all nested sets. | 1635 // Finished a complete set expression, including all nested sets. |
| 1618 // The close bracket has already triggered clearing out pending set op
erators, | 1636 // The close bracket has already triggered clearing out pending set op
erators, |
| 1619 // the operator stack should be empty and the operand stack should ha
ve just | 1637 // the operator stack should be empty and the operand stack should ha
ve just |
| 1620 // one entry, the result set. | 1638 // one entry, the result set. |
| 1621 U_ASSERT(fSetOpStack.empty()); | 1639 U_ASSERT(fSetOpStack.empty()); |
| 1622 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); | 1640 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); |
| 1623 U_ASSERT(fSetStack.empty()); | 1641 U_ASSERT(fSetStack.empty()); |
| 1624 compileSet(theSet); | 1642 compileSet(theSet); |
| 1625 break; | 1643 break; |
| 1626 } | 1644 } |
| 1627 | 1645 |
| 1628 case doSetIntersection2: | 1646 case doSetIntersection2: |
| 1629 // Have scanned something like [abc&& | 1647 // Have scanned something like [abc&& |
| 1630 setPushOp(setIntersection2); | 1648 setPushOp(setIntersection2); |
| 1631 break; | 1649 break; |
| 1632 | 1650 |
| 1633 case doSetLiteral: | 1651 case doSetLiteral: |
| 1634 // Union the just-scanned literal character into the set being built. | 1652 // Union the just-scanned literal character into the set being built. |
| 1635 // This operation is the highest precedence set operation, so we can
always do | 1653 // This operation is the highest precedence set operation, so we can
always do |
| 1636 // it immediately, without waiting to see what follows. It is necess
ary to perform | 1654 // it immediately, without waiting to see what follows. It is necess
ary to perform |
| 1637 // any pending '-' or '&' operation first, because these have the sam
e precedence | 1655 // any pending '-' or '&' operation first, because these have the sam
e precedence |
| 1638 // as union-ing in a literal' | 1656 // as union-ing in a literal' |
| 1639 { | 1657 { |
| 1640 setEval(setUnion); | 1658 setEval(setUnion); |
| 1641 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); | 1659 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); |
| 1642 s->add(fC.fChar); | 1660 s->add(fC.fChar); |
| 1643 fLastSetLiteral = fC.fChar; | 1661 fLastSetLiteral = fC.fChar; |
| 1644 break; | 1662 break; |
| 1645 } | 1663 } |
| 1646 | 1664 |
| 1647 case doSetLiteralEscaped: | 1665 case doSetLiteralEscaped: |
| 1648 // A back-slash escaped literal character was encountered. | 1666 // A back-slash escaped literal character was encountered. |
| (...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1723 case doSetPosixProp: | 1741 case doSetPosixProp: |
| 1724 { | 1742 { |
| 1725 UnicodeSet *s = scanPosixProp(); | 1743 UnicodeSet *s = scanPosixProp(); |
| 1726 if (s != NULL) { | 1744 if (s != NULL) { |
| 1727 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); | 1745 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); |
| 1728 tos->addAll(*s); | 1746 tos->addAll(*s); |
| 1729 delete s; | 1747 delete s; |
| 1730 } // else error. scanProp() reported the error status already. | 1748 } // else error. scanProp() reported the error status already. |
| 1731 } | 1749 } |
| 1732 break; | 1750 break; |
| 1733 | 1751 |
| 1734 case doSetProp: | 1752 case doSetProp: |
| 1735 // Scanned a \p \P within [brackets]. | 1753 // Scanned a \p \P within [brackets]. |
| 1736 { | 1754 { |
| 1737 UnicodeSet *s = scanProp(); | 1755 UnicodeSet *s = scanProp(); |
| 1738 if (s != NULL) { | 1756 if (s != NULL) { |
| 1739 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); | 1757 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); |
| 1740 tos->addAll(*s); | 1758 tos->addAll(*s); |
| 1741 delete s; | 1759 delete s; |
| 1742 } // else error. scanProp() reported the error status already. | 1760 } // else error. scanProp() reported the error status already. |
| 1743 } | 1761 } |
| 1744 break; | 1762 break; |
| 1745 | 1763 |
| 1746 | 1764 |
| 1747 case doSetRange: | 1765 case doSetRange: |
| 1748 // We have scanned literal-literal. Add the range to the set. | 1766 // We have scanned literal-literal. Add the range to the set. |
| 1749 // The left character is already in the set, and is saved in fLastSetLit
eral. | 1767 // The left character is already in the set, and is saved in fLastSetLit
eral. |
| 1750 // The right side is the current character. | 1768 // The right side is the current character. |
| 1751 // Lower Limit > Upper limit being an error matches both Java | 1769 // Lower Limit > Upper limit being an error matches both Java |
| 1752 // and ICU UnicodeSet behavior. | 1770 // and ICU UnicodeSet behavior. |
| 1753 { | 1771 { |
| 1754 if (fLastSetLiteral > fC.fChar) { | 1772 if (fLastSetLiteral > fC.fChar) { |
| 1755 error(U_REGEX_INVALID_RANGE); | 1773 error(U_REGEX_INVALID_RANGE); |
| 1756 } | 1774 } |
| 1757 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); | 1775 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); |
| 1758 s->add(fLastSetLiteral, fC.fChar); | 1776 s->add(fLastSetLiteral, fC.fChar); |
| 1759 break; | 1777 break; |
| 1760 } | 1778 } |
| 1761 | 1779 |
| 1762 default: | 1780 default: |
| 1763 U_ASSERT(FALSE); | 1781 U_ASSERT(FALSE); |
| 1764 error(U_REGEX_INTERNAL_ERROR); | 1782 error(U_REGEX_INTERNAL_ERROR); |
| 1765 break; | 1783 break; |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1804 | 1822 |
| 1805 // If no literal characters have been scanned but not yet had code generated | 1823 // If no literal characters have been scanned but not yet had code generated |
| 1806 // for them, nothing needs to be done. | 1824 // for them, nothing needs to be done. |
| 1807 if (fLiteralChars.length() == 0) { | 1825 if (fLiteralChars.length() == 0) { |
| 1808 return; | 1826 return; |
| 1809 } | 1827 } |
| 1810 | 1828 |
| 1811 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.lengt
h(), -1); | 1829 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.lengt
h(), -1); |
| 1812 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); | 1830 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); |
| 1813 | 1831 |
| 1814 // Split: We need to ensure that the last item in the compiled pattern | 1832 // Split: We need to ensure that the last item in the compiled pattern |
| 1815 // refers only to the last literal scanned in the pattern, so that | 1833 // refers only to the last literal scanned in the pattern, so that |
| 1816 // quantifiers (*, +, etc.) affect only it, and not a longer string. | 1834 // quantifiers (*, +, etc.) affect only it, and not a longer string. |
| 1817 // Split before case folding for case insensitive matches. | 1835 // Split before case folding for case insensitive matches. |
| 1818 | 1836 |
| 1819 if (split) { | 1837 if (split) { |
| 1820 fLiteralChars.truncate(indexOfLastCodePoint); | 1838 fLiteralChars.truncate(indexOfLastCodePoint); |
| 1821 fixLiterals(FALSE); // Recursive call, emit code to match the first pa
rt of the string. | 1839 fixLiterals(FALSE); // Recursive call, emit code to match the first pa
rt of the string. |
| 1822 // Note that the truncated literal string may be
empty, in which case | 1840 // Note that the truncated literal string may be
empty, in which case |
| 1823 // nothing will be emitted. | 1841 // nothing will be emitted. |
| 1824 | 1842 |
| 1825 literalChar(lastCodePoint); // Re-add the last code point as if it were
a new literal. | 1843 literalChar(lastCodePoint); // Re-add the last code point as if it were
a new literal. |
| 1826 fixLiterals(FALSE); // Second recursive call, code for the fina
l code point. | 1844 fixLiterals(FALSE); // Second recursive call, code for the fina
l code point. |
| 1827 return; | 1845 return; |
| 1828 } | 1846 } |
| 1829 | 1847 |
| 1830 // If we are doing case-insensitive matching, case fold the string. This ma
y expand | 1848 // If we are doing case-insensitive matching, case fold the string. This ma
y expand |
| 1831 // the string, e.g. the German sharp-s turns into "ss" | 1849 // the string, e.g. the German sharp-s turns into "ss" |
| 1832 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1850 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 1833 fLiteralChars.foldCase(); | 1851 fLiteralChars.foldCase(); |
| 1834 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(),
-1); | 1852 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(),
-1); |
| 1835 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); | 1853 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); |
| 1836 } | 1854 } |
| 1837 | 1855 |
| 1838 if (indexOfLastCodePoint == 0) { | 1856 if (indexOfLastCodePoint == 0) { |
| 1839 // Single character, emit a URX_ONECHAR op to match it. | 1857 // Single character, emit a URX_ONECHAR op to match it. |
| 1840 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && | 1858 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && |
| 1841 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { | 1859 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { |
| 1842 op = URX_BUILD(URX_ONECHAR_I, lastCodePoint); | 1860 op = URX_BUILD(URX_ONECHAR_I, lastCodePoint); |
| 1843 } else { | 1861 } else { |
| 1844 op = URX_BUILD(URX_ONECHAR, lastCodePoint); | 1862 op = URX_BUILD(URX_ONECHAR, lastCodePoint); |
| 1845 } | 1863 } |
| 1846 appendOp(op); | 1864 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1847 } else { | 1865 } else { |
| 1848 // Two or more chars, emit a URX_STRING to match them. | 1866 // Two or more chars, emit a URX_STRING to match them. |
| 1849 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1867 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 1850 op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length()); | 1868 op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length()); |
| 1851 } else { | 1869 } else { |
| 1852 // TODO here: add optimization to split case sensitive strings of l
ength two | 1870 // TODO here: add optimization to split case sensitive strings of l
ength two |
| 1853 // into two single char ops, for efficiency. | 1871 // into two single char ops, for efficiency. |
| 1854 op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length()); | 1872 op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length()); |
| 1855 } | 1873 } |
| 1856 appendOp(op); | 1874 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1857 op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length()); | 1875 op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length()); |
| 1858 appendOp(op); | 1876 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1859 | 1877 |
| 1860 // Add this string into the accumulated strings of the compiled pattern. | 1878 // Add this string into the accumulated strings of the compiled pattern. |
| 1861 // The total size of the accumulated strings must be restricted to 24 bi
ts because | |
| 1862 // string indexes appear as compiled pattern operand values. | |
| 1863 // This is the only place that the pattern.fLiteralText string is modifi
ed. | |
| 1864 | |
| 1865 fRXPat->fLiteralText.append(fLiteralChars); | 1879 fRXPat->fLiteralText.append(fLiteralChars); |
| 1866 if (U_SUCCESS(*fStatus) && fRXPat->fLiteralText.length() > 0x00ffffff) { | |
| 1867 *fStatus = U_REGEX_PATTERN_TOO_BIG; | |
| 1868 } | |
| 1869 } | 1880 } |
| 1870 | 1881 |
| 1871 fLiteralChars.remove(); | 1882 fLiteralChars.remove(); |
| 1872 } | 1883 } |
| 1873 | 1884 |
| 1874 | 1885 |
| 1875 //------------------------------------------------------------------------------ | 1886 |
| 1876 // | 1887 |
| 1877 // appendOp() Append a new instruction onto the compiled pattern | |
| 1878 // Includes error checking, limiting the size of the | |
| 1879 // pattern to lengths that can be represented in the | |
| 1880 // 24 bit operand field of an instruction. | |
| 1881 // | |
| 1882 //------------------------------------------------------------------------------ | |
| 1883 void RegexCompile::appendOp(int32_t op) { | |
| 1884 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1885 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) { | |
| 1886 *fStatus = U_REGEX_PATTERN_TOO_BIG; | |
| 1887 } | |
| 1888 } | |
| 1889 | 1888 |
| 1890 | 1889 |
| 1891 //------------------------------------------------------------------------------ | 1890 //------------------------------------------------------------------------------ |
| 1892 // | 1891 // |
| 1893 // insertOp() Insert a slot for a new opcode into the already | 1892 // insertOp() Insert a slot for a new opcode into the already |
| 1894 // compiled pattern code. | 1893 // compiled pattern code. |
| 1895 // | 1894 // |
| 1896 // Fill the slot with a NOP. Our caller will replace i
t | 1895 // Fill the slot with a NOP. Our caller will replace i
t |
| 1897 // with what they really wanted. | 1896 // with what they really wanted. |
| 1898 // | 1897 // |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1940 | 1939 |
| 1941 if (fMatchCloseParen > where) { | 1940 if (fMatchCloseParen > where) { |
| 1942 fMatchCloseParen++; | 1941 fMatchCloseParen++; |
| 1943 } | 1942 } |
| 1944 if (fMatchOpenParen > where) { | 1943 if (fMatchOpenParen > where) { |
| 1945 fMatchOpenParen++; | 1944 fMatchOpenParen++; |
| 1946 } | 1945 } |
| 1947 } | 1946 } |
| 1948 | 1947 |
| 1949 | 1948 |
| 1949 |
| 1950 //------------------------------------------------------------------------------ | 1950 //------------------------------------------------------------------------------ |
| 1951 // | 1951 // |
| 1952 // allocateData() Allocate storage in the matcher's static data area. | |
| 1953 // Return the index for the newly allocated data. | |
| 1954 // The storage won't actually exist until we are running
a match | |
| 1955 // operation, but the storage indexes are inserted into
various | |
| 1956 // opcodes while compiling the pattern. | |
| 1957 // | |
| 1958 //------------------------------------------------------------------------------ | |
| 1959 int32_t RegexCompile::allocateData(int32_t size) { | |
| 1960 if (U_FAILURE(*fStatus)) { | |
| 1961 return 0; | |
| 1962 } | |
| 1963 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) { | |
| 1964 *fStatus = U_REGEX_INTERNAL_ERROR; | |
| 1965 return 0; | |
| 1966 } | |
| 1967 int32_t dataIndex = fRXPat->fDataSize; | |
| 1968 fRXPat->fDataSize += size; | |
| 1969 if (fRXPat->fDataSize >= 0x00fffff0) { | |
| 1970 *fStatus = U_REGEX_PATTERN_TOO_BIG; | |
| 1971 } | |
| 1972 return dataIndex; | |
| 1973 } | |
| 1974 | |
| 1975 | |
| 1976 //------------------------------------------------------------------------------ | |
| 1977 // | |
| 1978 // allocateStackData() Allocate space in the back-tracking stack frame. | |
| 1979 // Return the index for the newly allocated data. | |
| 1980 // The frame indexes are inserted into various | |
| 1981 // opcodes while compiling the pattern, meaning that fra
me | |
| 1982 // size must be restricted to the size that will fit | |
| 1983 // as an operand (24 bits). | |
| 1984 // | |
| 1985 //------------------------------------------------------------------------------ | |
| 1986 int32_t RegexCompile::allocateStackData(int32_t size) { | |
| 1987 if (U_FAILURE(*fStatus)) { | |
| 1988 return 0; | |
| 1989 } | |
| 1990 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) { | |
| 1991 *fStatus = U_REGEX_INTERNAL_ERROR; | |
| 1992 return 0; | |
| 1993 } | |
| 1994 int32_t dataIndex = fRXPat->fFrameSize; | |
| 1995 fRXPat->fFrameSize += size; | |
| 1996 if (fRXPat->fFrameSize >= 0x00fffff0) { | |
| 1997 *fStatus = U_REGEX_PATTERN_TOO_BIG; | |
| 1998 } | |
| 1999 return dataIndex; | |
| 2000 } | |
| 2001 | |
| 2002 | |
| 2003 //------------------------------------------------------------------------------ | |
| 2004 // | |
| 2005 // blockTopLoc() Find or create a location in the compiled pattern | 1952 // blockTopLoc() Find or create a location in the compiled pattern |
| 2006 // at the start of the operation or block that has | 1953 // at the start of the operation or block that has |
| 2007 // just been compiled. Needed when a quantifier (* or | 1954 // just been compiled. Needed when a quantifier (* or |
| 2008 // whatever) appears, and we need to add an operation | 1955 // whatever) appears, and we need to add an operation |
| 2009 // at the start of the thing being quantified. | 1956 // at the start of the thing being quantified. |
| 2010 // | 1957 // |
| 2011 // (Parenthesized Blocks) have a slot with a NOP that | 1958 // (Parenthesized Blocks) have a slot with a NOP that |
| 2012 // is reserved for this purpose. .* or similar don't | 1959 // is reserved for this purpose. .* or similar don't |
| 2013 // and a slot needs to be added. | 1960 // and a slot needs to be added. |
| 2014 // | 1961 // |
| (...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2110 // Capturing Parentheses. | 2057 // Capturing Parentheses. |
| 2111 // Insert a End Capture op into the pattern. | 2058 // Insert a End Capture op into the pattern. |
| 2112 // The frame offset of the variables for this cg is obtained from the | 2059 // The frame offset of the variables for this cg is obtained from the |
| 2113 // start capture op and put it into the end-capture op. | 2060 // start capture op and put it into the end-capture op. |
| 2114 { | 2061 { |
| 2115 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMat
chOpenParen+1); | 2062 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMat
chOpenParen+1); |
| 2116 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); | 2063 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); |
| 2117 | 2064 |
| 2118 int32_t frameVarLocation = URX_VAL(captureOp); | 2065 int32_t frameVarLocation = URX_VAL(captureOp); |
| 2119 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation
); | 2066 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation
); |
| 2120 appendOp(endCaptureOp); | 2067 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); |
| 2121 } | 2068 } |
| 2122 break; | 2069 break; |
| 2123 case atomic: | 2070 case atomic: |
| 2124 // Atomic Parenthesis. | 2071 // Atomic Parenthesis. |
| 2125 // Insert a LD_SP operation to restore the state stack to the position | 2072 // Insert a LD_SP operation to restore the state stack to the position |
| 2126 // it was when the atomic parens were entered. | 2073 // it was when the atomic parens were entered. |
| 2127 { | 2074 { |
| 2128 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOp
enParen+1); | 2075 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOp
enParen+1); |
| 2129 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); | 2076 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); |
| 2130 int32_t stoLoc = URX_VAL(stoOp); | 2077 int32_t stoLoc = URX_VAL(stoOp); |
| 2131 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); | 2078 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); |
| 2132 appendOp(ldOp); | 2079 fRXPat->fCompiledPat->addElement(ldOp, *fStatus); |
| 2133 } | 2080 } |
| 2134 break; | 2081 break; |
| 2135 | 2082 |
| 2136 case lookAhead: | 2083 case lookAhead: |
| 2137 { | 2084 { |
| 2138 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); | 2085 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); |
| 2139 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | 2086 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); |
| 2140 int32_t dataLoc = URX_VAL(startOp); | 2087 int32_t dataLoc = URX_VAL(startOp); |
| 2141 int32_t op = URX_BUILD(URX_LA_END, dataLoc); | 2088 int32_t op = URX_BUILD(URX_LA_END, dataLoc); |
| 2142 appendOp(op); | 2089 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2143 } | 2090 } |
| 2144 break; | 2091 break; |
| 2145 | 2092 |
| 2146 case negLookAhead: | 2093 case negLookAhead: |
| 2147 { | 2094 { |
| 2148 // See comment at doOpenLookAheadNeg | 2095 // See comment at doOpenLookAheadNeg |
| 2149 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-1); | 2096 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-1); |
| 2150 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | 2097 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); |
| 2151 int32_t dataLoc = URX_VAL(startOp); | 2098 int32_t dataLoc = URX_VAL(startOp); |
| 2152 int32_t op = URX_BUILD(URX_LA_END, dataLoc); | 2099 int32_t op = URX_BUILD(URX_LA_END, dataLoc); |
| 2153 appendOp(op); | 2100 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2154 op = URX_BUILD(URX_BACKTRACK, 0); | 2101 op = URX_BUILD(URX_BACKTRACK, 0); |
| 2155 appendOp(op); | 2102 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2156 op = URX_BUILD(URX_LA_END, dataLoc); | 2103 op = URX_BUILD(URX_LA_END, dataLoc); |
| 2157 appendOp(op); | 2104 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2158 | 2105 |
| 2159 // Patch the URX_SAVE near the top of the block. | 2106 // Patch the URX_SAVE near the top of the block. |
| 2160 // The destination of the SAVE is the final LA_END that was just add
ed. | 2107 // The destination of the SAVE is the final LA_END that was just add
ed. |
| 2161 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen); | 2108 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen); |
| 2162 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); | 2109 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); |
| 2163 int32_t dest = fRXPat->fCompiledPat->size()-1; | 2110 int32_t dest = fRXPat->fCompiledPat->size()-1; |
| 2164 saveOp = URX_BUILD(URX_STATE_SAVE, dest); | 2111 saveOp = URX_BUILD(URX_STATE_SAVE, dest); |
| 2165 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); | 2112 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); |
| 2166 } | 2113 } |
| 2167 break; | 2114 break; |
| 2168 | 2115 |
| 2169 case lookBehind: | 2116 case lookBehind: |
| 2170 { | 2117 { |
| 2171 // See comment at doOpenLookBehind. | 2118 // See comment at doOpenLookBehind. |
| 2172 | 2119 |
| 2173 // Append the URX_LB_END and URX_LA_END to the compiled pattern. | 2120 // Append the URX_LB_END and URX_LA_END to the compiled pattern. |
| 2174 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-4); | 2121 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-4); |
| 2175 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | 2122 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); |
| 2176 int32_t dataLoc = URX_VAL(startOp); | 2123 int32_t dataLoc = URX_VAL(startOp); |
| 2177 int32_t op = URX_BUILD(URX_LB_END, dataLoc); | 2124 int32_t op = URX_BUILD(URX_LB_END, dataLoc); |
| 2178 appendOp(op); | 2125 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2179 op = URX_BUILD(URX_LA_END, dataLoc); | 2126 op = URX_BUILD(URX_LA_END, dataLoc); |
| 2180 appendOp(op); | 2127 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2181 | 2128 |
| 2182 // Determine the min and max bounds for the length of the | 2129 // Determine the min and max bounds for the length of the |
| 2183 // string that the pattern can match. | 2130 // string that the pattern can match. |
| 2184 // An unbounded upper limit is an error. | 2131 // An unbounded upper limit is an error. |
| 2185 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | 2132 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; |
| 2186 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | 2133 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); |
| 2187 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | 2134 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); |
| 2188 if (URX_TYPE(maxML) != 0) { | |
| 2189 error(U_REGEX_LOOK_BEHIND_LIMIT); | |
| 2190 break; | |
| 2191 } | |
| 2192 if (maxML == INT32_MAX) { | 2135 if (maxML == INT32_MAX) { |
| 2193 error(U_REGEX_LOOK_BEHIND_LIMIT); | 2136 error(U_REGEX_LOOK_BEHIND_LIMIT); |
| 2194 break; | 2137 break; |
| 2195 } | 2138 } |
| 2196 U_ASSERT(minML <= maxML); | 2139 U_ASSERT(minML <= maxML); |
| 2197 | 2140 |
| 2198 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat | 2141 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat |
| 2199 // appears at the top of the look-behind block, at location fMatchO
penParen+1 | 2142 // appears at the top of the look-behind block, at location fMatchO
penParen+1 |
| 2200 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); | 2143 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); |
| 2201 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); | 2144 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); |
| 2202 | 2145 |
| 2203 } | 2146 } |
| 2204 break; | 2147 break; |
| 2205 | 2148 |
| 2206 | 2149 |
| 2207 | 2150 |
| 2208 case lookBehindN: | 2151 case lookBehindN: |
| 2209 { | 2152 { |
| 2210 // See comment at doOpenLookBehindNeg. | 2153 // See comment at doOpenLookBehindNeg. |
| 2211 | 2154 |
| 2212 // Append the URX_LBN_END to the compiled pattern. | 2155 // Append the URX_LBN_END to the compiled pattern. |
| 2213 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); | 2156 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); |
| 2214 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | 2157 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); |
| 2215 int32_t dataLoc = URX_VAL(startOp); | 2158 int32_t dataLoc = URX_VAL(startOp); |
| 2216 int32_t op = URX_BUILD(URX_LBN_END, dataLoc); | 2159 int32_t op = URX_BUILD(URX_LBN_END, dataLoc); |
| 2217 appendOp(op); | 2160 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2218 | 2161 |
| 2219 // Determine the min and max bounds for the length of the | 2162 // Determine the min and max bounds for the length of the |
| 2220 // string that the pattern can match. | 2163 // string that the pattern can match. |
| 2221 // An unbounded upper limit is an error. | 2164 // An unbounded upper limit is an error. |
| 2222 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | 2165 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; |
| 2223 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | 2166 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); |
| 2224 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | 2167 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); |
| 2225 if (URX_TYPE(maxML) != 0) { | |
| 2226 error(U_REGEX_LOOK_BEHIND_LIMIT); | |
| 2227 break; | |
| 2228 } | |
| 2229 if (maxML == INT32_MAX) { | 2168 if (maxML == INT32_MAX) { |
| 2230 error(U_REGEX_LOOK_BEHIND_LIMIT); | 2169 error(U_REGEX_LOOK_BEHIND_LIMIT); |
| 2231 break; | 2170 break; |
| 2232 } | 2171 } |
| 2233 U_ASSERT(minML <= maxML); | 2172 U_ASSERT(minML <= maxML); |
| 2234 | 2173 |
| 2235 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat | 2174 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat |
| 2236 // appears at the top of the look-behind block, at location fMatchO
penParen+1 | 2175 // appears at the top of the look-behind block, at location fMatchO
penParen+1 |
| 2237 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); | 2176 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); |
| 2238 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); | 2177 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2273 // There shoudn't be any, but just in case. | 2212 // There shoudn't be any, but just in case. |
| 2274 // (Case Closure can add them; if we had a simple case closure avaialble
that | 2213 // (Case Closure can add them; if we had a simple case closure avaialble
that |
| 2275 // ignored strings, that would be better.) | 2214 // ignored strings, that would be better.) |
| 2276 theSet->removeAllStrings(); | 2215 theSet->removeAllStrings(); |
| 2277 int32_t setSize = theSet->size(); | 2216 int32_t setSize = theSet->size(); |
| 2278 | 2217 |
| 2279 switch (setSize) { | 2218 switch (setSize) { |
| 2280 case 0: | 2219 case 0: |
| 2281 { | 2220 { |
| 2282 // Set of no elements. Always fails to match. | 2221 // Set of no elements. Always fails to match. |
| 2283 appendOp(URX_BUILD(URX_BACKTRACK, 0)); | 2222 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStat
us); |
| 2284 delete theSet; | 2223 delete theSet; |
| 2285 } | 2224 } |
| 2286 break; | 2225 break; |
| 2287 | 2226 |
| 2288 case 1: | 2227 case 1: |
| 2289 { | 2228 { |
| 2290 // The set contains only a single code point. Put it into | 2229 // The set contains only a single code point. Put it into |
| 2291 // the compiled pattern as a single char operation rather | 2230 // the compiled pattern as a single char operation rather |
| 2292 // than a set, and discard the set itself. | 2231 // than a set, and discard the set itself. |
| 2293 literalChar(theSet->charAt(0)); | 2232 literalChar(theSet->charAt(0)); |
| 2294 delete theSet; | 2233 delete theSet; |
| 2295 } | 2234 } |
| 2296 break; | 2235 break; |
| 2297 | 2236 |
| 2298 default: | 2237 default: |
| 2299 { | 2238 { |
| 2300 // The set contains two or more chars. (the normal case) | 2239 // The set contains two or more chars. (the normal case) |
| 2301 // Put it into the compiled pattern as a set. | 2240 // Put it into the compiled pattern as a set. |
| 2302 int32_t setNumber = fRXPat->fSets->size(); | 2241 int32_t setNumber = fRXPat->fSets->size(); |
| 2303 fRXPat->fSets->addElement(theSet, *fStatus); | 2242 fRXPat->fSets->addElement(theSet, *fStatus); |
| 2304 int32_t setOp = URX_BUILD(URX_SETREF, setNumber); | 2243 int32_t setOp = URX_BUILD(URX_SETREF, setNumber); |
| 2305 appendOp(setOp); | 2244 fRXPat->fCompiledPat->addElement(setOp, *fStatus); |
| 2306 } | 2245 } |
| 2307 } | 2246 } |
| 2308 } | 2247 } |
| 2309 | 2248 |
| 2310 | 2249 |
| 2311 //------------------------------------------------------------------------------ | 2250 //------------------------------------------------------------------------------ |
| 2312 // | 2251 // |
| 2313 // compileInterval Generate the code for a {min, max} style interval quanti
fier. | 2252 // compileInterval Generate the code for a {min, max} style interval quanti
fier. |
| 2314 // Except for the specific opcodes used, the code is the sa
me | 2253 // Except for the specific opcodes used, the code is the sa
me |
| 2315 // for all three types (greedy, non-greedy, possessive) of | 2254 // for all three types (greedy, non-greedy, possessive) of |
| (...skipping 18 matching lines...) Expand all Loading... |
| 2334 int32_t topOfBlock = blockTopLoc(TRUE); | 2273 int32_t topOfBlock = blockTopLoc(TRUE); |
| 2335 insertOp(topOfBlock); | 2274 insertOp(topOfBlock); |
| 2336 insertOp(topOfBlock); | 2275 insertOp(topOfBlock); |
| 2337 insertOp(topOfBlock); | 2276 insertOp(topOfBlock); |
| 2338 | 2277 |
| 2339 // The operands for the CTR_INIT opcode include the index in the matcher dat
a | 2278 // The operands for the CTR_INIT opcode include the index in the matcher dat
a |
| 2340 // of the counter. Allocate it now. There are two data items | 2279 // of the counter. Allocate it now. There are two data items |
| 2341 // counterLoc --> Loop counter | 2280 // counterLoc --> Loop counter |
| 2342 // +1 --> Input index (for breaking non-progressing loops) | 2281 // +1 --> Input index (for breaking non-progressing loops) |
| 2343 // (Only present if unbounded upper limit on loop) | 2282 // (Only present if unbounded upper limit on loop) |
| 2344 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1; | 2283 int32_t counterLoc = fRXPat->fFrameSize; |
| 2345 int32_t counterLoc = allocateStackData(dataSize); | 2284 fRXPat->fFrameSize++; |
| 2285 if (fIntervalUpper < 0) { |
| 2286 fRXPat->fFrameSize++; |
| 2287 } |
| 2346 | 2288 |
| 2347 int32_t op = URX_BUILD(InitOp, counterLoc); | 2289 int32_t op = URX_BUILD(InitOp, counterLoc); |
| 2348 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); | 2290 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); |
| 2349 | 2291 |
| 2350 // The second operand of CTR_INIT is the location following the end of the l
oop. | 2292 // The second operand of CTR_INIT is the location following the end of the l
oop. |
| 2351 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if
the | 2293 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if
the |
| 2352 // compilation of something later on causes the code to grow and the targe
t | 2294 // compilation of something later on causes the code to grow and the targe
t |
| 2353 // position to move. | 2295 // position to move. |
| 2354 int32_t loopEnd = fRXPat->fCompiledPat->size(); | 2296 int32_t loopEnd = fRXPat->fCompiledPat->size(); |
| 2355 op = URX_BUILD(URX_RELOC_OPRND, loopEnd); | 2297 op = URX_BUILD(URX_RELOC_OPRND, loopEnd); |
| 2356 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); | 2298 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); |
| 2357 | 2299 |
| 2358 // Followed by the min and max counts. | 2300 // Followed by the min and max counts. |
| 2359 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); | 2301 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); |
| 2360 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); | 2302 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); |
| 2361 | 2303 |
| 2362 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. | 2304 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. |
| 2363 // Goes at end of the block being looped over, so just append to the code
so far. | 2305 // Goes at end of the block being looped over, so just append to the code
so far. |
| 2364 op = URX_BUILD(LoopOp, topOfBlock); | 2306 op = URX_BUILD(LoopOp, topOfBlock); |
| 2365 appendOp(op); | 2307 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2366 | 2308 |
| 2367 if ((fIntervalLow & 0xff000000) != 0 || | 2309 if ((fIntervalLow & 0xff000000) != 0 || |
| 2368 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { | 2310 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { |
| 2369 error(U_REGEX_NUMBER_TOO_BIG); | 2311 error(U_REGEX_NUMBER_TOO_BIG); |
| 2370 } | 2312 } |
| 2371 | 2313 |
| 2372 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { | 2314 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { |
| 2373 error(U_REGEX_MAX_LT_MIN); | 2315 error(U_REGEX_MAX_LT_MIN); |
| 2374 } | 2316 } |
| 2375 } | 2317 } |
| 2376 | 2318 |
| 2377 | 2319 |
| 2378 | 2320 |
| 2379 UBool RegexCompile::compileInlineInterval() { | 2321 UBool RegexCompile::compileInlineInterval() { |
| 2380 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { | 2322 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { |
| 2381 // Too big to inline. Fail, which will cause looping code to be generat
ed. | 2323 // Too big to inline. Fail, which will cause looping code to be generat
ed. |
| 2382 // (Upper < Lower picks up unbounded upper and errors, both.) | 2324 // (Upper < Lower picks up unbounded upper and errors, both.) |
| 2383 return FALSE; | 2325 return FALSE; |
| 2384 } | 2326 } |
| 2385 | 2327 |
| 2386 int32_t topOfBlock = blockTopLoc(FALSE); | 2328 int32_t topOfBlock = blockTopLoc(FALSE); |
| 2387 if (fIntervalUpper == 0) { | 2329 if (fIntervalUpper == 0) { |
| 2388 // Pathological case. Attempt no matches, as if the block doesn't exist
. | 2330 // Pathological case. Attempt no matches, as if the block doesn't exist
. |
| 2389 // Discard the generated code for the block. | |
| 2390 // If the block included parens, discard the info pertaining to them as
well. | |
| 2391 fRXPat->fCompiledPat->setSize(topOfBlock); | 2331 fRXPat->fCompiledPat->setSize(topOfBlock); |
| 2392 if (fMatchOpenParen >= topOfBlock) { | |
| 2393 fMatchOpenParen = -1; | |
| 2394 } | |
| 2395 if (fMatchCloseParen >= topOfBlock) { | |
| 2396 fMatchCloseParen = -1; | |
| 2397 } | |
| 2398 return TRUE; | 2332 return TRUE; |
| 2399 } | 2333 } |
| 2400 | 2334 |
| 2401 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { | 2335 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { |
| 2402 // The thing being repeated is not a single op, but some | 2336 // The thing being repeated is not a single op, but some |
| 2403 // more complex block. Do it as a loop, not inlines. | 2337 // more complex block. Do it as a loop, not inlines. |
| 2404 // Note that things "repeated" a max of once are handled as inline, be
cause | 2338 // Note that things "repeated" a max of once are handled as inline, be
cause |
| 2405 // the one copy of the code already generated is just fine. | 2339 // the one copy of the code already generated is just fine. |
| 2406 return FALSE; | 2340 return FALSE; |
| 2407 } | 2341 } |
| (...skipping 14 matching lines...) Expand all Loading... |
| 2422 } | 2356 } |
| 2423 | 2357 |
| 2424 | 2358 |
| 2425 | 2359 |
| 2426 // Loop, emitting the op for the thing being repeated each time. | 2360 // Loop, emitting the op for the thing being repeated each time. |
| 2427 // Loop starts at 1 because one instance of the op already exists in the
pattern, | 2361 // Loop starts at 1 because one instance of the op already exists in the
pattern, |
| 2428 // it was put there when it was originally encountered. | 2362 // it was put there when it was originally encountered. |
| 2429 int32_t i; | 2363 int32_t i; |
| 2430 for (i=1; i<fIntervalUpper; i++ ) { | 2364 for (i=1; i<fIntervalUpper; i++ ) { |
| 2431 if (i == fIntervalLow) { | 2365 if (i == fIntervalLow) { |
| 2432 appendOp(saveOp); | 2366 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); |
| 2433 } | 2367 } |
| 2434 if (i > fIntervalLow) { | 2368 if (i > fIntervalLow) { |
| 2435 appendOp(saveOp); | 2369 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); |
| 2436 } | 2370 } |
| 2437 appendOp(op); | 2371 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 2438 } | 2372 } |
| 2439 return TRUE; | 2373 return TRUE; |
| 2440 } | 2374 } |
| 2441 | 2375 |
| 2442 | 2376 |
| 2443 | 2377 |
| 2444 //------------------------------------------------------------------------------ | 2378 //------------------------------------------------------------------------------ |
| 2445 // | 2379 // |
| | 2380 // findCaseInsensitiveStarters given a single code point from a pattern string, dete
rmine the |
| 2381 // set of characters that could potentially begin a case
-insensitive |
| 2382 // match of a string beginning with that character, usin
g full Unicode |
| 2383 // case insensitive matching. |
| 2384 // |
| 2385 // This is used in optimizing find(). |
| 2386 // |
| 2387 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but |
| 2388 // misses cases like this: |
| 2389 // A string from the pattern begins with 'ss' (although all we know |
| 2390 // in this context is that it begins with 's') |
| 2391 // The pattern could match a string beginning with a German sharp-s |
| 2392 // |
| 2393 // To the ordinary case closure for a character c, we add all other |
| | 2394 // characters cx where the case closure of cx includes a string form th
at begins |
| 2395 // with the original character c. |
| 2396 // |
| 2397 // This function could be made smarter. The full pattern string is ava
ilable |
| 2398 // and it would be possible to verify that the extra characters being
added |
| 2399 // to the starting set fully match, rather than having just a first-ch
ar of the |
| 2400 // folded form match. |
| 2401 // |
| 2402 //------------------------------------------------------------------------------ |
| 2403 void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterCh
ars) { |
| 2404 |
| 2405 // Machine Generated below. |
| 2406 // It may need updating with new versions of Unicode. |
| 2407 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update i
s needed. |
| 2408 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/tru
nk/unicode/c/genregexcasing |
| 2409 |
| 2410 // Machine Generated Data. Do not hand edit. |
| 2411 static const UChar32 RECaseFixCodePoints[] = { |
| 2412 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, |
| 2413 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, |
| 2414 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x
1f07, |
| 2415 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60,
0x1f61, |
| 2416 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c,
0x110000}; |
| 2417 |
| 2418 static const int16_t RECaseFixStringOffsets[] = { |
| 2419 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, |
| 2420 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, |
| 2421 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, |
| 2422 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, |
| 2423 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0}; |
| 2424 |
| 2425 static const int16_t RECaseFixCounts[] = { |
| 2426 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, |
| 2427 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, |
| 2428 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, |
| 2429 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, |
| 2430 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0}; |
| 2431 |
| 2432 static const UChar RECaseFixData[] = { |
| 2433 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0x
df, |
| 2434 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0
x1fb3, |
| 2435 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0
x1fd3, |
| 2436 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0
x1fe3, |
| 2437 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0
xfb14, |
| 2438 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a,
0x1f83, |
| 2439 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f,
0x1f90, |
| 2440 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c,
0x1f95, |
| 2441 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9,
0x1fa2, |
| 2442 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae,
0x1fa7, |
| 2443 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0}; |
| 2444 |
| 2445 // End of machine generated data. |
| 2446 |
| 2447 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { |
| 2448 UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT); |
| 2449 starterChars->set(caseFoldedC, caseFoldedC); |
| 2450 |
| 2451 int32_t i; |
| 2452 for (i=0; RECaseFixCodePoints[i]<c ; i++) { |
| 2453 // Simple linear search through the sorted list of interesting code
points. |
| 2454 } |
| 2455 |
| 2456 if (RECaseFixCodePoints[i] == c) { |
| 2457 int32_t dataIndex = RECaseFixStringOffsets[i]; |
| 2458 int32_t numCharsToAdd = RECaseFixCounts[i]; |
| 2459 UChar32 cpToAdd = 0; |
| 2460 for (int32_t j=0; j<numCharsToAdd; j++) { |
| 2461 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd); |
| 2462 starterChars->add(cpToAdd); |
| 2463 } |
| 2464 } |
| 2465 |
| 2466 starterChars->closeOver(USET_CASE_INSENSITIVE); |
| 2467 starterChars->removeAllStrings(); |
| 2468 } else { |
| 2469 // Not a cased character. Just return it alone. |
| 2470 starterChars->set(c, c); |
| 2471 } |
| 2472 } |
| 2473 |
| 2474 |
| 2475 |
| 2476 |
| 2477 //------------------------------------------------------------------------------ |
| 2478 // |
| 2446 // matchStartType Determine how a match can start. | 2479 // matchStartType Determine how a match can start. |
| 2447 // Used to optimize find() operations. | 2480 // Used to optimize find() operations. |
| 2448 // | 2481 // |
| 2449 // Operation is very similar to minMatchLength(). Walk the
compiled | 2482 // Operation is very similar to minMatchLength(). Walk the
compiled |
| 2450 // pattern, keeping an on-going minimum-match-length. For a
ny | 2483 // pattern, keeping an on-going minimum-match-length. For a
ny |
| 2451 // op where the min match coming in is zero, add that ops po
ssible | 2484 // op where the min match coming in is zero, add that ops po
ssible |
| 2452 // starting matches to the possible starts for the overall p
attern. | 2485 // starting matches to the possible starts for the overall p
attern. |
| 2453 // | 2486 // |
| 2454 //------------------------------------------------------------------------------ | 2487 //------------------------------------------------------------------------------ |
| 2455 void RegexCompile::matchStartType() { | 2488 void RegexCompile::matchStartType() { |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2507 case URX_BACKSLASH_G: | 2540 case URX_BACKSLASH_G: |
| 2508 case URX_BACKSLASH_Z: | 2541 case URX_BACKSLASH_Z: |
| 2509 case URX_DOLLAR: | 2542 case URX_DOLLAR: |
| 2510 case URX_DOLLAR_M: | 2543 case URX_DOLLAR_M: |
| 2511 case URX_DOLLAR_D: | 2544 case URX_DOLLAR_D: |
| 2512 case URX_DOLLAR_MD: | 2545 case URX_DOLLAR_MD: |
| 2513 case URX_RELOC_OPRND: | 2546 case URX_RELOC_OPRND: |
| 2514 case URX_STO_INP_LOC: | 2547 case URX_STO_INP_LOC: |
| 2515 case URX_BACKREF: // BackRef. Must assume that it might be a ze
ro length match | 2548 case URX_BACKREF: // BackRef. Must assume that it might be a ze
ro length match |
| 2516 case URX_BACKREF_I: | 2549 case URX_BACKREF_I: |
| 2517 | 2550 |
| 2518 case URX_STO_SP: // Setup for atomic or possessive blocks. Doe
sn't change what can match. | 2551 case URX_STO_SP: // Setup for atomic or possessive blocks. Doe
sn't change what can match. |
| 2519 case URX_LD_SP: | 2552 case URX_LD_SP: |
| 2520 break; | 2553 break; |
| 2521 | 2554 |
| 2522 case URX_CARET: | 2555 case URX_CARET: |
| 2523 if (atStart) { | 2556 if (atStart) { |
| 2524 fRXPat->fStartType = START_START; | 2557 fRXPat->fStartType = START_START; |
| 2525 } | 2558 } |
| 2526 break; | 2559 break; |
| 2527 | 2560 |
| (...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2624 currentLen++; | 2657 currentLen++; |
| 2625 atStart = FALSE; | 2658 atStart = FALSE; |
| 2626 break; | 2659 break; |
| 2627 | 2660 |
| 2628 | 2661 |
| 2629 case URX_ONECHAR_I: | 2662 case URX_ONECHAR_I: |
| 2630 // Case Insensitive Single Character. | 2663 // Case Insensitive Single Character. |
| 2631 if (currentLen == 0) { | 2664 if (currentLen == 0) { |
| 2632 UChar32 c = URX_VAL(op); | 2665 UChar32 c = URX_VAL(op); |
| 2633 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { | 2666 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { |
| 2634 | 2667 UnicodeSet starters(c, c); |
| 2635 // Disable optimizations on first char of match. | 2668 starters.closeOver(USET_CASE_INSENSITIVE); |
| 2636 // TODO: Compute the set of chars that case fold to this cha
r, or to | 2669 // findCaseInsensitiveStarters(c, &starters); |
| 2637 // a string that begins with this char. | 2670 // For ONECHAR_I, no need to worry about text chars that e
xpand on folding into strings. |
| 2638 // For simple case folding, this code worked: | 2671 // The expanded folding can't match the pattern. |
| 2639 // UnicodeSet s(c, c); | 2672 fRXPat->fInitialChars->addAll(starters); |
| 2640 // s.closeOver(USET_CASE_INSENSITIVE); | |
| 2641 // fRXPat->fInitialChars->addAll(s); | |
| 2642 | |
| 2643 fRXPat->fInitialChars->clear(); | |
| 2644 fRXPat->fInitialChars->complement(); | |
| 2645 } else { | 2673 } else { |
| 2646 // Char has no case variants. Just add it as-is to the | 2674 // Char has no case variants. Just add it as-is to the |
| 2647 // set of possible starting chars. | 2675 // set of possible starting chars. |
| 2648 fRXPat->fInitialChars->add(c); | 2676 fRXPat->fInitialChars->add(c); |
| 2649 } | 2677 } |
| 2650 numInitialStrings += 2; | 2678 numInitialStrings += 2; |
| 2651 } | 2679 } |
| 2652 currentLen++; | 2680 currentLen++; |
| 2653 atStart = FALSE; | 2681 atStart = FALSE; |
| 2654 break; | 2682 break; |
| (...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2757 loc++; | 2785 loc++; |
| 2758 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(
loc); | 2786 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(
loc); |
| 2759 int32_t stringLen = URX_VAL(stringLenOp); | 2787 int32_t stringLen = URX_VAL(stringLenOp); |
| 2760 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); | 2788 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); |
| 2761 U_ASSERT(stringLenOp >= 2); | 2789 U_ASSERT(stringLenOp >= 2); |
| 2762 if (currentLen == 0) { | 2790 if (currentLen == 0) { |
| 2763 // Add the starting character of this string to the set of p
ossible starting | 2791 // Add the starting character of this string to the set of p
ossible starting |
| 2764 // characters for this pattern. | 2792 // characters for this pattern. |
| 2765 int32_t stringStartIdx = URX_VAL(op); | 2793 int32_t stringStartIdx = URX_VAL(op); |
| 2766 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); | 2794 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); |
| 2767 UnicodeSet s(c, c); | 2795 UnicodeSet s; |
| 2768 | 2796 findCaseInsensitiveStarters(c, &s); |
| 2769 // TODO: compute correct set of starting chars for full cas
e folding. | |
| 2770 // For the moment, say any char can start. | |
| 2771 // s.closeOver(USET_CASE_INSENSITIVE); | |
| 2772 s.clear(); | |
| 2773 s.complement(); | |
| 2774 | |
| 2775 fRXPat->fInitialChars->addAll(s); | 2797 fRXPat->fInitialChars->addAll(s); |
| 2776 numInitialStrings += 2; // Matching on an initial string no
t possible. | 2798 numInitialStrings += 2; // Matching on an initial string no
t possible. |
| 2777 } | 2799 } |
| 2778 currentLen += stringLen; | 2800 currentLen += stringLen; |
| 2779 atStart = FALSE; | 2801 atStart = FALSE; |
| 2780 } | 2802 } |
| 2781 break; | 2803 break; |
| 2782 | 2804 |
| 2783 case URX_CTR_INIT: | 2805 case URX_CTR_INIT: |
| 2784 case URX_CTR_INIT_NG: | 2806 case URX_CTR_INIT_NG: |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2820 // don't change the minimum match | 2842 // don't change the minimum match |
| 2821 atStart = FALSE; | 2843 atStart = FALSE; |
| 2822 break; | 2844 break; |
| 2823 | 2845 |
| 2824 | 2846 |
| 2825 case URX_LA_START: | 2847 case URX_LA_START: |
| 2826 case URX_LB_START: | 2848 case URX_LB_START: |
| 2827 { | 2849 { |
| 2828 // Look-around. Scan forward until the matching look-ahead end, | 2850 // Look-around. Scan forward until the matching look-ahead end, |
| 2829 // without processing the look-around block. This is overly p
essimistic. | 2851 // without processing the look-around block. This is overly p
essimistic. |
| 2830 | 2852 |
| 2831 // Keep track of the nesting depth of look-around blocks. Boile
rplate code for | 2853 // Keep track of the nesting depth of look-around blocks. Boile
rplate code for |
| 2832 // lookahead contains two LA_END instructions, so count goes u
p by two | 2854 // lookahead contains two LA_END instructions, so count goes u
p by two |
| 2833 // for each LA_START. | 2855 // for each LA_START. |
| 2834 int32_t depth = (opType == URX_LA_START? 2: 1); | 2856 int32_t depth = (opType == URX_LA_START? 2: 1); |
| 2835 for (;;) { | 2857 for (;;) { |
| 2836 loc++; | 2858 loc++; |
| 2837 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); | 2859 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); |
| 2838 if (URX_TYPE(op) == URX_LA_START) { | 2860 if (URX_TYPE(op) == URX_LA_START) { |
| 2839 depth+=2; | 2861 depth+=2; |
| 2840 } | 2862 } |
| (...skipping 539 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3380 | 3402 |
| 3381 case URX_STRING_I: | 3403 case URX_STRING_I: |
| 3382 // TODO: This code assumes that any user string that matches will b
e no longer | 3404 // TODO: This code assumes that any user string that matches will b
e no longer |
| 3383 // than our compiled string, with case insensitive matching. | 3405 // than our compiled string, with case insensitive matching. |
| 3384 // Our compiled string has been case-folded already. | 3406 // Our compiled string has been case-folded already. |
| 3385 // | 3407 // |
| 3386 // Any matching user string will have no more code points tha
n our | 3408 // Any matching user string will have no more code points tha
n our |
| 3387 // compiled (folded) string. Folding may add code points, bu
t | 3409 // compiled (folded) string. Folding may add code points, bu
t |
| 3388 // not remove them. | 3410 // not remove them. |
| 3389 // | 3411 // |
| 3390 // There is a potential problem if a supplemental code point | 3412 // There is a potential problem if a supplemental code point |
| 3391 // case-folds to a BMP code point. In this case our compiled
string | 3413 // case-folds to a BMP code point. In this case our compiled
string |
| 3392 // could be shorter (in code units) than a matching user stri
ng. | 3414 // could be shorter (in code units) than a matching user stri
ng. |
| 3393 // | 3415 // |
| 3394 // At this time (Unicode 6.1) there are no such characters, a
nd this case | 3416 // At this time (Unicode 6.1) there are no such characters, a
nd this case |
| 3395 // is not being handled. A test, intltest regex/Bug9283, wil
l fail if | 3417 // is not being handled. A test, intltest regex/Bug9283, wil
l fail if |
| 3396 // any problematic characters are added to Unicode. | 3418 // any problematic characters are added to Unicode. |
| 3397 // | 3419 // |
| 3398 // If this happens, we can make a set of the BMP chars that t
he | 3420 // If this happens, we can make a set of the BMP chars that t
he |
| 3399 // troublesome supplementals fold to, scan our string, and bu
mp the | 3421 // troublesome supplementals fold to, scan our string, and bu
mp the |
| 3400 // currentLen one extra for each that is found. | 3422 // currentLen one extra for each that is found. |
| (...skipping 10 matching lines...) Expand all Loading... |
| 3411 // For Loops, recursively call this function on the pattern for the
loop body, | 3433 // For Loops, recursively call this function on the pattern for the
loop body, |
| 3412 // then multiply the result by the maximum loop count. | 3434 // then multiply the result by the maximum loop count. |
| 3413 { | 3435 { |
| 3414 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(l
oc+1)); | 3436 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(l
oc+1)); |
| 3415 if (loopEndLoc == loc+4) { | 3437 if (loopEndLoc == loc+4) { |
| 3416 // Loop has an empty body. No affect on max match length. | 3438 // Loop has an empty body. No affect on max match length. |
| 3417 // Continue processing with code after the loop end. | 3439 // Continue processing with code after the loop end. |
| 3418 loc = loopEndLoc; | 3440 loc = loopEndLoc; |
| 3419 break; | 3441 break; |
| 3420 } | 3442 } |
| 3421 | 3443 |
| 3422 int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3); | 3444 int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3); |
| 3423 if (maxLoopCount == -1) { | 3445 if (maxLoopCount == -1) { |
| 3424 // Unbounded Loop. No upper bound on match length. | 3446 // Unbounded Loop. No upper bound on match length. |
| 3425 currentLen = INT32_MAX; | 3447 currentLen = INT32_MAX; |
| 3426 break; | 3448 break; |
| 3427 } | 3449 } |
| 3428 | 3450 |
| 3429 U_ASSERT(loopEndLoc >= loc+4); | 3451 U_ASSERT(loopEndLoc >= loc+4); |
| 3430 int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Rec
ursive call. | 3452 int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Rec
ursive call. |
| 3431 if (blockLen == INT32_MAX) { | 3453 if (blockLen == INT32_MAX) { |
| (...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3529 // will be offset at each location in the original code. | 3551 // will be offset at each location in the original code. |
| 3530 int32_t loc; | 3552 int32_t loc; |
| 3531 int32_t d = 0; | 3553 int32_t d = 0; |
| 3532 for (loc=0; loc<end; loc++) { | 3554 for (loc=0; loc<end; loc++) { |
| 3533 deltas.addElement(d, *fStatus); | 3555 deltas.addElement(d, *fStatus); |
| 3534 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); | 3556 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); |
| 3535 if (URX_TYPE(op) == URX_NOP) { | 3557 if (URX_TYPE(op) == URX_NOP) { |
| 3536 d++; | 3558 d++; |
| 3537 } | 3559 } |
| 3538 } | 3560 } |
| 3539 | 3561 |
| 3540 UnicodeString caseStringBuffer; | 3562 UnicodeString caseStringBuffer; |
| 3541 | 3563 |
| 3542 // Make a second pass over the code, removing the NOPs by moving following | 3564 // Make a second pass over the code, removing the NOPs by moving following |
| 3543 // code up, and patching operands that refer to code locations that | 3565 // code up, and patching operands that refer to code locations that |
| 3544 // are being moved. The array of offsets from the first step is used | 3566 // are being moved. The array of offsets from the first step is used |
| 3545 // to compute the new operand values. | 3567 // to compute the new operand values. |
| 3546 int32_t src; | 3568 int32_t src; |
| 3547 int32_t dst = 0; | 3569 int32_t dst = 0; |
| 3548 for (src=0; src<end; src++) { | 3570 for (src=0; src<end; src++) { |
| 3549 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); | 3571 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); |
| (...skipping 26 matching lines...) Expand all Loading... |
| 3576 { | 3598 { |
| 3577 int32_t where = URX_VAL(op); | 3599 int32_t where = URX_VAL(op); |
| 3578 if (where > fRXPat->fGroupMap->size()) { | 3600 if (where > fRXPat->fGroupMap->size()) { |
| 3579 error(U_REGEX_INVALID_BACK_REF); | 3601 error(U_REGEX_INVALID_BACK_REF); |
| 3580 break; | 3602 break; |
| 3581 } | 3603 } |
| 3582 where = fRXPat->fGroupMap->elementAti(where-1); | 3604 where = fRXPat->fGroupMap->elementAti(where-1); |
| 3583 op = URX_BUILD(opType, where); | 3605 op = URX_BUILD(opType, where); |
| 3584 fRXPat->fCompiledPat->setElementAt(op, dst); | 3606 fRXPat->fCompiledPat->setElementAt(op, dst); |
| 3585 dst++; | 3607 dst++; |
| 3586 | 3608 |
| 3587 fRXPat->fNeedsAltInput = TRUE; | 3609 fRXPat->fNeedsAltInput = TRUE; |
| 3588 break; | 3610 break; |
| 3589 } | 3611 } |
| 3590 case URX_RESERVED_OP: | 3612 case URX_RESERVED_OP: |
| 3591 case URX_RESERVED_OP_N: | 3613 case URX_RESERVED_OP_N: |
| 3592 case URX_BACKTRACK: | 3614 case URX_BACKTRACK: |
| 3593 case URX_END: | 3615 case URX_END: |
| 3594 case URX_ONECHAR: | 3616 case URX_ONECHAR: |
| 3595 case URX_STRING: | 3617 case URX_STRING: |
| 3596 case URX_STRING_LEN: | 3618 case URX_STRING_LEN: |
| (...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3667 if (fLineNum > 0x7FFFFFFF) { | 3689 if (fLineNum > 0x7FFFFFFF) { |
| 3668 fParseErr->line = 0; | 3690 fParseErr->line = 0; |
| 3669 fParseErr->offset = -1; | 3691 fParseErr->offset = -1; |
| 3670 } else if (fCharNum > 0x7FFFFFFF) { | 3692 } else if (fCharNum > 0x7FFFFFFF) { |
| 3671 fParseErr->line = (int32_t)fLineNum; | 3693 fParseErr->line = (int32_t)fLineNum; |
| 3672 fParseErr->offset = -1; | 3694 fParseErr->offset = -1; |
| 3673 } else { | 3695 } else { |
| 3674 fParseErr->line = (int32_t)fLineNum; | 3696 fParseErr->line = (int32_t)fLineNum; |
| 3675 fParseErr->offset = (int32_t)fCharNum; | 3697 fParseErr->offset = (int32_t)fCharNum; |
| 3676 } | 3698 } |
| 3677 | 3699 |
| 3678 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting con
text | 3700 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting con
text |
| 3679 | 3701 |
| 3680 // Fill in the context. | 3702 // Fill in the context. |
| 3681 // Note: extractBetween() pins supplied indicies to the string bounds. | 3703 // Note: extractBetween() pins supplied indicies to the string bounds. |
| 3682 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); | 3704 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); |
| 3683 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); | 3705 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); |
| 3684 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanI
ndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); | 3706 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanI
ndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); |
| 3685 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_L
EN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); | 3707 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_L
EN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); |
| 3686 } | 3708 } |
| 3687 } | 3709 } |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3721 // | 3743 // |
| 3722 //------------------------------------------------------------------------------ | 3744 //------------------------------------------------------------------------------ |
| 3723 UChar32 RegexCompile::nextCharLL() { | 3745 UChar32 RegexCompile::nextCharLL() { |
| 3724 UChar32 ch; | 3746 UChar32 ch; |
| 3725 | 3747 |
| 3726 if (fPeekChar != -1) { | 3748 if (fPeekChar != -1) { |
| 3727 ch = fPeekChar; | 3749 ch = fPeekChar; |
| 3728 fPeekChar = -1; | 3750 fPeekChar = -1; |
| 3729 return ch; | 3751 return ch; |
| 3730 } | 3752 } |
| 3731 | 3753 |
| 3732 // assume we're already in the right place | 3754 // assume we're already in the right place |
| 3733 ch = UTEXT_NEXT32(fRXPat->fPattern); | 3755 ch = UTEXT_NEXT32(fRXPat->fPattern); |
| 3734 if (ch == U_SENTINEL) { | 3756 if (ch == U_SENTINEL) { |
| 3735 return ch; | 3757 return ch; |
| 3736 } | 3758 } |
| 3737 | 3759 |
| 3738 if (ch == chCR || | 3760 if (ch == chCR || |
| 3739 ch == chNEL || | 3761 ch == chNEL || |
| 3740 ch == chLS || | 3762 ch == chLS || |
| 3741 (ch == chLF && fLastChar != chCR)) { | 3763 (ch == chLF && fLastChar != chCR)) { |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3777 // | 3799 // |
| 3778 //------------------------------------------------------------------------------ | 3800 //------------------------------------------------------------------------------ |
| 3779 void RegexCompile::nextChar(RegexPatternChar &c) { | 3801 void RegexCompile::nextChar(RegexPatternChar &c) { |
| 3780 | 3802 |
| 3781 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); | 3803 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); |
| 3782 c.fChar = nextCharLL(); | 3804 c.fChar = nextCharLL(); |
| 3783 c.fQuoted = FALSE; | 3805 c.fQuoted = FALSE; |
| 3784 | 3806 |
| 3785 if (fQuoteMode) { | 3807 if (fQuoteMode) { |
| 3786 c.fQuoted = TRUE; | 3808 c.fQuoted = TRUE; |
| 3787 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_
LITERAL) == 0)) || | 3809 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_
LITERAL) == 0)) || |
| 3788 c.fChar == (UChar32)-1) { | 3810 c.fChar == (UChar32)-1) { |
| 3789 fQuoteMode = FALSE; // Exit quote mode, | 3811 fQuoteMode = FALSE; // Exit quote mode, |
| 3790 nextCharLL(); // discard the E | 3812 nextCharLL(); // discard the E |
| 3791 nextChar(c); // recurse to get the real next char | 3813 nextChar(c); // recurse to get the real next char |
| 3792 } | 3814 } |
| 3793 } | 3815 } |
| 3794 else if (fInBackslashQuote) { | 3816 else if (fInBackslashQuote) { |
| 3795 // The current character immediately follows a '\' | 3817 // The current character immediately follows a '\' |
| 3796 // Don't check for any further escapes, just return it as-is. | 3818 // Don't check for any further escapes, just return it as-is. |
| 3797 // Don't set c.fQuoted, because that would prevent the state machine fro
m | 3819 // Don't set c.fQuoted, because that would prevent the state machine fro
m |
| (...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3838 if (c.fChar == chBackSlash) { | 3860 if (c.fChar == chBackSlash) { |
| 3839 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); | 3861 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); |
| 3840 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekChar
LL())) { | 3862 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekChar
LL())) { |
| 3841 // | 3863 // |
| 3842 // A '\' sequence that is handled by ICU's standard unescapeAt f
unction. | 3864 // A '\' sequence that is handled by ICU's standard unescapeAt f
unction. |
| 3843 // Includes \uxxxx, \n, \r, many others. | 3865 // Includes \uxxxx, \n, \r, many others. |
| 3844 // Return the single equivalent character. | 3866 // Return the single equivalent character. |
| 3845 // | 3867 // |
| 3846 nextCharLL(); // get & discard the peeked char. | 3868 nextCharLL(); // get & discard the peeked char. |
| 3847 c.fQuoted = TRUE; | 3869 c.fQuoted = TRUE; |
| 3848 | 3870 |
| 3849 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength))
{ | 3871 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength))
{ |
| 3850 int32_t endIndex = (int32_t)pos; | 3872 int32_t endIndex = (int32_t)pos; |
| 3851 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endInd
ex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); | 3873 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endInd
ex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); |
| 3852 | 3874 |
| 3853 if (endIndex == pos) { | 3875 if (endIndex == pos) { |
| 3854 error(U_REGEX_BAD_ESCAPE_SEQUENCE); | 3876 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
| 3855 } | 3877 } |
| 3856 fCharNum += endIndex - pos; | 3878 fCharNum += endIndex - pos; |
| 3857 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); | 3879 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); |
| 3858 } else { | 3880 } else { |
| 3859 int32_t offset = 0; | 3881 int32_t offset = 0; |
| 3860 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEX
T_UNESCAPE_CONTEXT(fRXPat->fPattern); | 3882 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEX
T_UNESCAPE_CONTEXT(fRXPat->fPattern); |
| 3861 | 3883 |
| 3862 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); | 3884 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); |
| 3863 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset
, INT32_MAX, &context); | 3885 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset
, INT32_MAX, &context); |
| 3864 | 3886 |
| 3865 if (offset == 0) { | 3887 if (offset == 0) { |
| 3866 error(U_REGEX_BAD_ESCAPE_SEQUENCE); | 3888 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
| 3867 } else if (context.lastOffset == offset) { | 3889 } else if (context.lastOffset == offset) { |
| 3868 UTEXT_PREVIOUS32(fRXPat->fPattern); | 3890 UTEXT_PREVIOUS32(fRXPat->fPattern); |
| 3869 } else if (context.lastOffset != offset-1) { | 3891 } else if (context.lastOffset != offset-1) { |
| 3870 utext_moveIndex32(fRXPat->fPattern, offset - context.las
tOffset - 1); | 3892 utext_moveIndex32(fRXPat->fPattern, offset - context.las
tOffset - 1); |
| 3871 } | 3893 } |
| (...skipping 22 matching lines...) Expand all Loading... |
| 3894 } | 3916 } |
| 3895 c.fChar <<= 3; | 3917 c.fChar <<= 3; |
| 3896 c.fChar += ch&7; | 3918 c.fChar += ch&7; |
| 3897 if (c.fChar <= 255) { | 3919 if (c.fChar <= 255) { |
| 3898 nextCharLL(); | 3920 nextCharLL(); |
| 3899 } else { | 3921 } else { |
| 3900 // The last digit made the number too big. Forget we sa
w it. | 3922 // The last digit made the number too big. Forget we sa
w it. |
| 3901 c.fChar >>= 3; | 3923 c.fChar >>= 3; |
| 3902 } | 3924 } |
| 3903 } | 3925 } |
| 3904 c.fQuoted = TRUE; | 3926 c.fQuoted = TRUE; |
| 3905 } | 3927 } |
| 3906 else if (peekCharLL() == chQ) { | 3928 else if (peekCharLL() == chQ) { |
| 3907 // "\Q" enter quote mode, which will continue until "\E" | 3929 // "\Q" enter quote mode, which will continue until "\E" |
| 3908 fQuoteMode = TRUE; | 3930 fQuoteMode = TRUE; |
| 3909 nextCharLL(); // discard the 'Q'. | 3931 nextCharLL(); // discard the 'Q'. |
| 3910 nextChar(c); // recurse to get the real next char. | 3932 nextChar(c); // recurse to get the real next char. |
| 3911 } | 3933 } |
| 3912 else | 3934 else |
| 3913 { | 3935 { |
| 3914 // We are in a '\' escape that will be handled by the state tabl
e scanner. | 3936 // We are in a '\' escape that will be handled by the state tabl
e scanner. |
| 3915 // Just return the backslash, but remember that the following ch
ar is to | 3937 // Just return the backslash, but remember that the following ch
ar is to |
| (...skipping 27 matching lines...) Expand all Loading... |
| 3943 UChar32 RegexCompile::scanNamedChar() { | 3965 UChar32 RegexCompile::scanNamedChar() { |
| 3944 if (U_FAILURE(*fStatus)) { | 3966 if (U_FAILURE(*fStatus)) { |
| 3945 return 0; | 3967 return 0; |
| 3946 } | 3968 } |
| 3947 | 3969 |
| 3948 nextChar(fC); | 3970 nextChar(fC); |
| 3949 if (fC.fChar != chLBrace) { | 3971 if (fC.fChar != chLBrace) { |
| 3950 error(U_REGEX_PROPERTY_SYNTAX); | 3972 error(U_REGEX_PROPERTY_SYNTAX); |
| 3951 return 0; | 3973 return 0; |
| 3952 } | 3974 } |
| 3953 | 3975 |
| 3954 UnicodeString charName; | 3976 UnicodeString charName; |
| 3955 for (;;) { | 3977 for (;;) { |
| 3956 nextChar(fC); | 3978 nextChar(fC); |
| 3957 if (fC.fChar == chRBrace) { | 3979 if (fC.fChar == chRBrace) { |
| 3958 break; | 3980 break; |
| 3959 } | 3981 } |
| 3960 if (fC.fChar == -1) { | 3982 if (fC.fChar == -1) { |
| 3961 error(U_REGEX_PROPERTY_SYNTAX); | 3983 error(U_REGEX_PROPERTY_SYNTAX); |
| 3962 return 0; | 3984 return 0; |
| 3963 } | 3985 } |
| 3964 charName.append(fC.fChar); | 3986 charName.append(fC.fChar); |
| 3965 } | 3987 } |
| 3966 | 3988 |
| 3967 char name[100]; | 3989 char name[100]; |
| 3968 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || | 3990 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || |
| 3969 (uint32_t)charName.length()>=sizeof(name)) { | 3991 (uint32_t)charName.length()>=sizeof(name)) { |
| 3970 // All Unicode character names have only invariant characters. | 3992 // All Unicode character names have only invariant characters. |
| 3971 // The API to get a character, given a name, accepts only char *, forcin
g us to convert, | 3993 // The API to get a character, given a name, accepts only char *, forcin
g us to convert, |
| 3972 // which requires this error check | 3994 // which requires this error check |
| 3973 error(U_REGEX_PROPERTY_SYNTAX); | 3995 error(U_REGEX_PROPERTY_SYNTAX); |
| 3974 return 0; | 3996 return 0; |
| 3975 } | 3997 } |
| 3976 charName.extract(0, charName.length(), name, sizeof(name), US_INV); | 3998 charName.extract(0, charName.length(), name, sizeof(name), US_INV); |
| (...skipping 18 matching lines...) Expand all Loading... |
| 3995 // Return a UnicodeSet, constructed from the \P pattern, | 4017 // Return a UnicodeSet, constructed from the \P pattern, |
| 3996 // or NULL if the pattern is invalid. | 4018 // or NULL if the pattern is invalid. |
| 3997 // | 4019 // |
| 3998 //------------------------------------------------------------------------------ | 4020 //------------------------------------------------------------------------------ |
| 3999 UnicodeSet *RegexCompile::scanProp() { | 4021 UnicodeSet *RegexCompile::scanProp() { |
| 4000 UnicodeSet *uset = NULL; | 4022 UnicodeSet *uset = NULL; |
| 4001 | 4023 |
| 4002 if (U_FAILURE(*fStatus)) { | 4024 if (U_FAILURE(*fStatus)) { |
| 4003 return NULL; | 4025 return NULL; |
| 4004 } | 4026 } |
| 4027 (void)chLowerP; // Suppress compiler unused variable warning. |
| 4005 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); | 4028 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); |
| 4006 UBool negated = (fC.fChar == chP); | 4029 UBool negated = (fC.fChar == chP); |
| 4007 | 4030 |
| 4008 UnicodeString propertyName; | 4031 UnicodeString propertyName; |
| 4009 nextChar(fC); | 4032 nextChar(fC); |
| 4010 if (fC.fChar != chLBrace) { | 4033 if (fC.fChar != chLBrace) { |
| 4011 error(U_REGEX_PROPERTY_SYNTAX); | 4034 error(U_REGEX_PROPERTY_SYNTAX); |
| 4012 return NULL; | 4035 return NULL; |
| 4013 } | 4036 } |
| 4014 for (;;) { | 4037 for (;;) { |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4064 UBool savedInBackslashQuote = fInBackslashQuote; | 4087 UBool savedInBackslashQuote = fInBackslashQuote; |
| 4065 UBool savedEOLComments = fEOLComments; | 4088 UBool savedEOLComments = fEOLComments; |
| 4066 int64_t savedLineNum = fLineNum; | 4089 int64_t savedLineNum = fLineNum; |
| 4067 int64_t savedCharNum = fCharNum; | 4090 int64_t savedCharNum = fCharNum; |
| 4068 UChar32 savedLastChar = fLastChar; | 4091 UChar32 savedLastChar = fLastChar; |
| 4069 UChar32 savedPeekChar = fPeekChar; | 4092 UChar32 savedPeekChar = fPeekChar; |
| 4070 RegexPatternChar savedfC = fC; | 4093 RegexPatternChar savedfC = fC; |
| 4071 | 4094 |
| 4072 // Scan for a closing ]. A little tricky because there are some perverse | 4095 // Scan for a closing ]. A little tricky because there are some perverse |
| 4073 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expre
ssion, | 4096 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expre
ssion, |
| 4074 // ending on the second closing ]. | 4097 // ending on the second closing ]. |
| 4075 | 4098 |
| 4076 UnicodeString propName; | 4099 UnicodeString propName; |
| 4077 UBool negated = FALSE; | 4100 UBool negated = FALSE; |
| 4078 | 4101 |
| 4079 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Lette
r:] | 4102 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Lette
r:] |
| 4080 nextChar(fC); | 4103 nextChar(fC); |
| 4081 if (fC.fChar == chUp) { | 4104 if (fC.fChar == chUp) { |
| 4082 negated = TRUE; | 4105 negated = TRUE; |
| 4083 nextChar(fC); | 4106 nextChar(fC); |
| 4084 } | 4107 } |
| 4085 | 4108 |
| 4086 // Scan for the closing ":]", collecting the property name along the way. | 4109 // Scan for the closing ":]", collecting the property name along the way. |
| 4087 UBool sawPropSetTerminator = FALSE; | 4110 UBool sawPropSetTerminator = FALSE; |
| 4088 for (;;) { | 4111 for (;;) { |
| 4089 propName.append(fC.fChar); | 4112 propName.append(fC.fChar); |
| 4090 nextChar(fC); | 4113 nextChar(fC); |
| 4091 if (fC.fQuoted || fC.fChar == -1) { | 4114 if (fC.fQuoted || fC.fChar == -1) { |
| 4092 // Escaped characters or end of input - either says this isn't a [:P
roperty:] | 4115 // Escaped characters or end of input - either says this isn't a [:P
roperty:] |
| 4093 break; | 4116 break; |
| 4094 } | 4117 } |
| 4095 if (fC.fChar == chColon) { | 4118 if (fC.fChar == chColon) { |
| 4096 nextChar(fC); | 4119 nextChar(fC); |
| 4097 if (fC.fChar == chRBracket) { | 4120 if (fC.fChar == chRBracket) { |
| 4098 sawPropSetTerminator = TRUE; | 4121 sawPropSetTerminator = TRUE; |
| 4099 } | 4122 } |
| 4100 break; | 4123 break; |
| 4101 } | 4124 } |
| 4102 } | 4125 } |
| 4103 | 4126 |
| 4104 if (sawPropSetTerminator) { | 4127 if (sawPropSetTerminator) { |
| 4105 uset = createSetForProperty(propName, negated); | 4128 uset = createSetForProperty(propName, negated); |
| 4106 } | 4129 } |
| 4107 else | 4130 else |
| 4108 { | 4131 { |
| 4109 // No closing ":]". | 4132 // No closing ":]". |
| 4110 // Restore the original scan position. | 4133 // Restore the original scan position. |
| 4111 // The main scanner will retry the input as a normal set expression, | 4134 // The main scanner will retry the input as a normal set expression, |
| 4112 // not a [:Property:] expression. | 4135 // not a [:Property:] expression. |
| 4113 fScanIndex = savedScanIndex; | 4136 fScanIndex = savedScanIndex; |
| (...skipping 12 matching lines...) Expand all Loading... |
| 4126 | 4149 |
| 4127 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { | 4150 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { |
| 4128 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); | 4151 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); |
| 4129 addCategory(set, U_GC_CF_MASK, ec); | 4152 addCategory(set, U_GC_CF_MASK, ec); |
| 4130 } | 4153 } |
| 4131 | 4154 |
| 4132 // | 4155 // |
| 4133 // Create a Unicode Set from a Unicode Property expression. | 4156 // Create a Unicode Set from a Unicode Property expression. |
| 4134 // This is common code underlying both \p{...} ane [:...:] expressions. | 4157 // This is common code underlying both \p{...} ane [:...:] expressions. |
| 4135 // Includes trying the Java "properties" that aren't supported as | 4158 // Includes trying the Java "properties" that aren't supported as |
| 4136 // normal ICU UnicodeSet properties | 4159 // normal ICU UnicodeSet properties |
| 4137 // | 4160 // |
| 4138 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" | 4161 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" |
| 4139 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" | 4162 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" |
| 4140 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
ool negated) { | 4163 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
ool negated) { |
| 4141 UnicodeString setExpr; | 4164 UnicodeString setExpr; |
| 4142 UnicodeSet *set; | 4165 UnicodeSet *set; |
| 4143 uint32_t usetFlags = 0; | 4166 uint32_t usetFlags = 0; |
| 4144 | 4167 |
| 4145 if (U_FAILURE(*fStatus)) { | 4168 if (U_FAILURE(*fStatus)) { |
| 4146 return NULL; | 4169 return NULL; |
| 4147 } | 4170 } |
| 4148 | 4171 |
| 4149 // | 4172 // |
| 4150 // First try the property as we received it | 4173 // First try the property as we received it |
| 4151 // | 4174 // |
| 4152 if (negated) { | 4175 if (negated) { |
| 4153 setExpr.append(negSetPrefix, -1); | 4176 setExpr.append(negSetPrefix, -1); |
| 4154 } else { | 4177 } else { |
| 4155 setExpr.append(posSetPrefix, -1); | 4178 setExpr.append(posSetPrefix, -1); |
| 4156 } | 4179 } |
| 4157 setExpr.append(propName); | 4180 setExpr.append(propName); |
| 4158 setExpr.append(chRBrace); | 4181 setExpr.append(chRBrace); |
| 4159 setExpr.append(chRBracket); | 4182 setExpr.append(chRBracket); |
| 4160 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 4183 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 4161 usetFlags |= USET_CASE_INSENSITIVE; | 4184 usetFlags |= USET_CASE_INSENSITIVE; |
| 4162 } | 4185 } |
| 4163 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); | 4186 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); |
| 4164 if (U_SUCCESS(*fStatus)) { | 4187 if (U_SUCCESS(*fStatus)) { |
| 4165 return set; | 4188 return set; |
| 4166 } | 4189 } |
| 4167 delete set; | 4190 delete set; |
| 4168 set = NULL; | 4191 set = NULL; |
| 4169 | 4192 |
| 4170 // | 4193 // |
| 4171 // The property as it was didn't work. | 4194 // The property as it was didn't work. |
| 4172 | 4195 |
| 4173 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" n
ot standard POSIX | 4196 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" n
ot standard POSIX |
| 4174 // or standard Java, but many other regular expression packages do recog
nize it. | 4197 // or standard Java, but many other regular expression packages do recog
nize it. |
| 4175 | 4198 |
| 4176 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) { | 4199 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) { |
| 4177 *fStatus = U_ZERO_ERROR; | 4200 *fStatus = U_ZERO_ERROR; |
| 4178 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])); | 4201 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])); |
| 4179 if (set == NULL) { | 4202 if (set == NULL) { |
| 4180 *fStatus = U_MEMORY_ALLOCATION_ERROR; | 4203 *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| 4181 return set; | 4204 return set; |
| 4182 } | 4205 } |
| 4183 if (negated) { | 4206 if (negated) { |
| 4184 set->complement(); | 4207 set->complement(); |
| 4185 } | 4208 } |
| 4186 return set; | 4209 return set; |
| 4187 } | 4210 } |
| 4188 | 4211 |
| 4189 | 4212 |
| 4190 // Do Java fixes - | 4213 // Do Java fixes - |
| 4191 // InGreek -> InGreek or Coptic, that being the official Unicode name
for that block. | 4214 // InGreek -> InGreek or Coptic, that being the official Unicode name
for that block. |
| 4192 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols
. | 4215 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols
. |
| 4193 // | 4216 // |
| 4194 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombinin
g Marks for Symbols" | 4217 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombinin
g Marks for Symbols" |
| 4195 // is accepted by Java. The property part of the nam
e is compared | 4218 // is accepted by Java. The property part of the nam
e is compared |
| 4196 // case-insenstively. The spaces must be exactly as
shown, either | 4219 // case-insenstively. The spaces must be exactly as
shown, either |
| 4197 // all there, or all omitted, with exactly one at eac
h position | 4220 // all there, or all omitted, with exactly one at eac
h position |
| 4198 // if they are present. From checking against JDK 1.
6 | 4221 // if they are present. From checking against JDK 1.
6 |
| 4199 // | 4222 // |
| 4200 // This code should be removed when ICU properties support the Java c
ompatibility names | 4223 // This code should be removed when ICU properties support the Java c
ompatibility names |
| 4201 // (ICU 4.0?) | 4224 // (ICU 4.0?) |
| 4202 // | 4225 // |
| 4203 UnicodeString mPropName = propName; | 4226 UnicodeString mPropName = propName; |
| 4204 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { | 4227 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { |
| 4205 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); | 4228 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); |
| 4206 } | 4229 } |
| 4207 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbo
ls"), 0) == 0 || | 4230 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbo
ls"), 0) == 0 || |
| 4208 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"
), 0) == 0) { | 4231 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"
), 0) == 0) { |
| 4209 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Sym
bols"); | 4232 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Sym
bols"); |
| 4210 } | 4233 } |
| 4211 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { | 4234 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { |
| 4212 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); | 4235 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); |
| 4213 } | 4236 } |
| 4214 | 4237 |
| 4215 // See if the property looks like a Java "InBlockName", which | 4238 // See if the property looks like a Java "InBlockName", which |
| 4216 // we will recast as "Block=BlockName" | 4239 // we will recast as "Block=BlockName" |
| 4217 // | 4240 // |
| 4218 static const UChar IN[] = {0x49, 0x6E, 0}; // "In" | 4241 static const UChar IN[] = {0x49, 0x6E, 0}; // "In" |
| 4219 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "
Block=" | 4242 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "
Block=" |
| 4220 if (mPropName.startsWith(IN, 2) && propName.length()>=3) { | 4243 if (mPropName.startsWith(IN, 2) && propName.length()>=3) { |
| 4221 setExpr.truncate(4); // Leaves "[\p{", or "[\P{" | 4244 setExpr.truncate(4); // Leaves "[\p{", or "[\P{" |
| 4222 setExpr.append(BLOCK, -1); | 4245 setExpr.append(BLOCK, -1); |
| 4223 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leadi
ng "In" removed. | 4246 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leadi
ng "In" removed. |
| 4224 setExpr.append(chRBrace); | 4247 setExpr.append(chRBrace); |
| (...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4328 } | 4351 } |
| 4329 if (negated) { | 4352 if (negated) { |
| 4330 set->complement(); | 4353 set->complement(); |
| 4331 } | 4354 } |
| 4332 return set; | 4355 return set; |
| 4333 } | 4356 } |
| 4334 delete set; | 4357 delete set; |
| 4335 set = NULL; | 4358 set = NULL; |
| 4336 } | 4359 } |
| 4337 error(*fStatus); | 4360 error(*fStatus); |
| 4338 return NULL; | 4361 return NULL; |
| 4339 } | 4362 } |
| 4340 | 4363 |
| 4341 | 4364 |
| 4342 | 4365 |
| 4343 // | 4366 // |
| 4344 // SetEval Part of the evaluation of [set expressions]. | 4367 // SetEval Part of the evaluation of [set expressions]. |
| 4345 // Perform any pending (stacked) operations with precedence | 4368 // Perform any pending (stacked) operations with precedence |
| 4346 // equal or greater to that of the next operator encountered | 4369 // equal or greater to that of the next operator encountered |
| 4347 // in the expression. | 4370 // in the expression. |
| 4348 // | 4371 // |
| (...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4396 | 4419 |
| 4397 void RegexCompile::setPushOp(int32_t op) { | 4420 void RegexCompile::setPushOp(int32_t op) { |
| 4398 setEval(op); | 4421 setEval(op); |
| 4399 fSetOpStack.push(op, *fStatus); | 4422 fSetOpStack.push(op, *fStatus); |
| 4400 fSetStack.push(new UnicodeSet(), *fStatus); | 4423 fSetStack.push(new UnicodeSet(), *fStatus); |
| 4401 } | 4424 } |
| 4402 | 4425 |
| 4403 U_NAMESPACE_END | 4426 U_NAMESPACE_END |
| 4404 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | 4427 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 4405 | 4428 |
| OLD | NEW |