| OLD | NEW |
| 1 // | 1 // |
| 2 // file: regexcmp.cpp | 2 // file: regexcmp.cpp |
| 3 // | 3 // |
| 4 // Copyright (C) 2002-2014 International Business Machines Corporation and othe
rs. | 4 // Copyright (C) 2002-2014 International Business Machines Corporation and othe
rs. |
| 5 // All Rights Reserved. | 5 // All Rights Reserved. |
| 6 // | 6 // |
| 7 // This file contains the ICU regular expression compiler, which is responsible | 7 // This file contains the ICU regular expression compiler, which is responsible |
| 8 // for processing a regular expression pattern into the compiled form that | 8 // for processing a regular expression pattern into the compiled form that |
| 9 // is used by the match finding engine. | 9 // is used by the match finding engine. |
| 10 // | 10 // |
| (...skipping 283 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 294 n *= 10; | 294 n *= 10; |
| 295 } | 295 } |
| 296 | 296 |
| 297 // | 297 // |
| 298 // The pattern's fFrameSize so far has accumulated the requirements for | 298 // The pattern's fFrameSize so far has accumulated the requirements for |
| 299 // storage for capture parentheses, counters, etc. that are encountered | 299 // storage for capture parentheses, counters, etc. that are encountered |
| 300 // in the pattern. Add space for the two variables that are always | 300 // in the pattern. Add space for the two variables that are always |
| 301 // present in the saved state: the input string position (int64_t) and | 301 // present in the saved state: the input string position (int64_t) and |
| 302 // the position in the compiled pattern. | 302 // the position in the compiled pattern. |
| 303 // | 303 // |
| 304 fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; | 304 allocateStackData(RESTACKFRAME_HDRCOUNT); |
| 305 | 305 |
| 306 // | 306 // |
| 307 // Optimization pass 1: NOPs, back-references, and case-folding | 307 // Optimization pass 1: NOPs, back-references, and case-folding |
| 308 // | 308 // |
| 309 stripNOPs(); | 309 stripNOPs(); |
| 310 | 310 |
| 311 // | 311 // |
| 312 // Get bounds for the minimum and maximum length of a string that this | 312 // Get bounds for the minimum and maximum length of a string that this |
| 313 // pattern can match. Used to avoid looking for matches in strings that | 313 // pattern can match. Used to avoid looking for matches in strings that |
| 314 // are too short. | 314 // are too short. |
| (...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 360 | 360 |
| 361 case doPatStart: | 361 case doPatStart: |
| 362 // Start of pattern compiles to: | 362 // Start of pattern compiles to: |
| 363 //0 SAVE 2 Fall back to position of FAIL | 363 //0 SAVE 2 Fall back to position of FAIL |
| 364 //1 jmp 3 | 364 //1 jmp 3 |
| 365 //2 FAIL Stop if we ever reach here. | 365 //2 FAIL Stop if we ever reach here. |
| 366 //3 NOP Dummy, so start of pattern looks the same as | 366 //3 NOP Dummy, so start of pattern looks the same as |
| 367 // the start of an ( grouping. | 367 // the start of an ( grouping. |
| 368 //4 NOP Resreved, will be replaced by a save if there are | 368 //4 NOP Resreved, will be replaced by a save if there are |
| 369 // OR | operators at the top level | 369 // OR | operators at the top level |
| 370 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus)
; | 370 appendOp(URX_STATE_SAVE, 2); |
| 371 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); | 371 appendOp(URX_JMP, 3); |
| 372 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); | 372 appendOp(URX_FAIL, 0); |
| 373 | 373 |
| 374 // Standard open nonCapture paren action emits the two NOPs and | 374 // Standard open nonCapture paren action emits the two NOPs and |
| 375 // sets up the paren stack frame. | 375 // sets up the paren stack frame. |
| 376 doParseActions(doOpenNonCaptureParen); | 376 doParseActions(doOpenNonCaptureParen); |
| 377 break; | 377 break; |
| 378 | 378 |
| 379 case doPatFinish: | 379 case doPatFinish: |
| 380 // We've scanned to the end of the pattern | 380 // We've scanned to the end of the pattern |
| 381 // The end of pattern compiles to: | 381 // The end of pattern compiles to: |
| 382 // URX_END | 382 // URX_END |
| 383 // which will stop the runtime match engine. | 383 // which will stop the runtime match engine. |
| 384 // Encountering end of pattern also behaves like a close paren, | 384 // Encountering end of pattern also behaves like a close paren, |
| 385 // and forces fixups of the State Save at the beginning of the compile
d pattern | 385 // and forces fixups of the State Save at the beginning of the compile
d pattern |
| 386 // and of any OR operations at the top level. | 386 // and of any OR operations at the top level. |
| 387 // | 387 // |
| 388 handleCloseParen(); | 388 handleCloseParen(); |
| 389 if (fParenStack.size() > 0) { | 389 if (fParenStack.size() > 0) { |
| 390 // Missing close paren in pattern. | 390 // Missing close paren in pattern. |
| 391 error(U_REGEX_MISMATCHED_PAREN); | 391 error(U_REGEX_MISMATCHED_PAREN); |
| 392 } | 392 } |
| 393 | 393 |
| 394 // add the END operation to the compiled pattern. | 394 // add the END operation to the compiled pattern. |
| 395 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); | 395 appendOp(URX_END, 0); |
| 396 | 396 |
| 397 // Terminate the pattern compilation state machine. | 397 // Terminate the pattern compilation state machine. |
| 398 returnVal = FALSE; | 398 returnVal = FALSE; |
| 399 break; | 399 break; |
| 400 | 400 |
| 401 | 401 |
| 402 | 402 |
| 403 case doOrOperator: | 403 case doOrOperator: |
| 404 // Scanning a '|', as in (A|B) | 404 // Scanning a '|', as in (A|B) |
| 405 { | 405 { |
| 406 // Generate code for any pending literals preceding the '|' | 406 // Generate code for any pending literals preceding the '|' |
| 407 fixLiterals(FALSE); | 407 fixLiterals(FALSE); |
| 408 | 408 |
| 409 // Insert a SAVE operation at the start of the pattern section prece
ding | 409 // Insert a SAVE operation at the start of the pattern section prece
ding |
| 410 // this OR at this level. This SAVE will branch the match forward | 410 // this OR at this level. This SAVE will branch the match forward |
| 411 // to the right hand side of the OR in the event that the left han
d | 411 // to the right hand side of the OR in the event that the left han
d |
| 412 // side fails to match and backtracks. Locate the position for th
e | 412 // side fails to match and backtracks. Locate the position for th
e |
| 413 // save from the location on the top of the parentheses stack. | 413 // save from the location on the top of the parentheses stack. |
| 414 int32_t savePosition = fParenStack.popi(); | 414 int32_t savePosition = fParenStack.popi(); |
| 415 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition)
; | 415 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition)
; |
| 416 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved
location | 416 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved
location |
| 417 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); | 417 op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); |
| 418 fRXPat->fCompiledPat->setElementAt(op, savePosition); | 418 fRXPat->fCompiledPat->setElementAt(op, savePosition); |
| 419 | 419 |
| 420 // Append an JMP operation into the compiled pattern. The operand f
or | 420 // Append an JMP operation into the compiled pattern. The operand f
or |
| 421 // the JMP will eventually be the location following the ')' for th
e | 421 // the JMP will eventually be the location following the ')' for th
e |
| 422 // group. This will be patched in later, when the ')' is encounter
ed. | 422 // group. This will be patched in later, when the ')' is encounter
ed. |
| 423 op = URX_BUILD(URX_JMP, 0); | 423 appendOp(URX_JMP, 0); |
| 424 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 425 | 424 |
| 426 // Push the position of the newly added JMP op onto the parentheses
stack. | 425 // Push the position of the newly added JMP op onto the parentheses
stack. |
| 427 // This registers if for fixup when this block's close paren is enco
untered. | 426 // This registers if for fixup when this block's close paren is enco
untered. |
| 428 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | 427 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
| 429 | 428 |
| 430 // Append a NOP to the compiled pattern. This is the slot reserved | 429 // Append a NOP to the compiled pattern. This is the slot reserved |
| 431 // for a SAVE in the event that there is yet another '|' following | 430 // for a SAVE in the event that there is yet another '|' following |
| 432 // this one. | 431 // this one. |
| 433 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 432 appendOp(URX_NOP, 0); |
| 434 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | 433 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
| 435 } | 434 } |
| 436 break; | 435 break; |
| 437 | 436 |
| 438 | 437 |
| 439 case doOpenCaptureParen: | 438 case doOpenCaptureParen: |
| 440 // Open Paren. | 439 // Open Paren. |
| 441 // Compile to a | 440 // Compile to a |
| 442 // - NOP, which later may be replaced by a save-state if the | 441 // - NOP, which later may be replaced by a save-state if the |
| 443 // parenthesized group gets a * quantifier, followed by | 442 // parenthesized group gets a * quantifier, followed by |
| 444 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. | 443 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. |
| 445 // - NOP, which may later be replaced by a save-state if there | 444 // - NOP, which may later be replaced by a save-state if there |
| 446 // is an '|' alternation within the parens. | 445 // is an '|' alternation within the parens. |
| 447 // | 446 // |
| 448 // Each capture group gets three slots in the save stack frame: | 447 // Each capture group gets three slots in the save stack frame: |
| 449 // 0: Capture Group start position (in input string being matche
d.) | 448 // 0: Capture Group start position (in input string being matche
d.) |
| 450 // 1: Capture Group end position. | 449 // 1: Capture Group end position. |
| 451 // 2: Start of Match-in-progress. | 450 // 2: Start of Match-in-progress. |
| 452 // The first two locations are for a completed capture group, and are | 451 // The first two locations are for a completed capture group, and are |
| 453 // referred to by back references and the like. | 452 // referred to by back references and the like. |
| 454 // The third location stores the capture start position when an START
_CAPTURE is | 453 // The third location stores the capture start position when an START
_CAPTURE is |
| 455 // encountered. This will be promoted to a completed capture when
(and if) the corresponding | 454 // encountered. This will be promoted to a completed capture when
(and if) the corresponding |
| 456 // END_CAPTURE is encountered. | 455 // END_CAPTURE is encountered. |
| 457 { | 456 { |
| 458 fixLiterals(); | 457 fixLiterals(); |
| 459 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 458 appendOp(URX_NOP, 0); |
| 460 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots
in match stack frame. | 459 int32_t varsLoc = allocateStackData(3); // Reserve three slots i
n match stack frame. |
| 461 fRXPat->fFrameSize += 3; | 460 appendOp(URX_START_CAPTURE, varsLoc); |
| 462 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); | 461 appendOp(URX_NOP, 0); |
| 463 fRXPat->fCompiledPat->addElement(cop, *fStatus); | |
| 464 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
| 465 | 462 |
| 466 // On the Parentheses stack, start a new frame and add the postions | 463 // On the Parentheses stack, start a new frame and add the postions |
| 467 // of the two NOPs. Depending on what follows in the pattern, the | 464 // of the two NOPs. Depending on what follows in the pattern, the |
| 468 // NOPs may be changed to SAVE_STATE or JMP ops, with a target | 465 // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
| 469 // address of the end of the parenthesized group. | 466 // address of the end of the parenthesized group. |
| 470 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 467 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 471 fParenStack.push(capturing, *fStatus); // Fra
me type. | 468 fParenStack.push(capturing, *fStatus); // Fra
me type. |
| 472 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location | 469 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location |
| 473 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc | 470 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc |
| 474 | 471 |
| 475 // Save the mapping from group number to stack frame variable positi
on. | 472 // Save the mapping from group number to stack frame variable positi
on. |
| 476 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); | 473 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); |
| 477 } | 474 } |
| 478 break; | 475 break; |
| 479 | 476 |
| 480 case doOpenNonCaptureParen: | 477 case doOpenNonCaptureParen: |
| 481 // Open non-caputuring (grouping only) Paren. | 478 // Open non-caputuring (grouping only) Paren. |
| 482 // Compile to a | 479 // Compile to a |
| 483 // - NOP, which later may be replaced by a save-state if the | 480 // - NOP, which later may be replaced by a save-state if the |
| 484 // parenthesized group gets a * quantifier, followed by | 481 // parenthesized group gets a * quantifier, followed by |
| 485 // - NOP, which may later be replaced by a save-state if there | 482 // - NOP, which may later be replaced by a save-state if there |
| 486 // is an '|' alternation within the parens. | 483 // is an '|' alternation within the parens. |
| 487 { | 484 { |
| 488 fixLiterals(); | 485 fixLiterals(); |
| 489 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 486 appendOp(URX_NOP, 0); |
| 490 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 487 appendOp(URX_NOP, 0); |
| 491 | 488 |
| 492 // On the Parentheses stack, start a new frame and add the postions | 489 // On the Parentheses stack, start a new frame and add the postions |
| 493 // of the two NOPs. | 490 // of the two NOPs. |
| 494 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 491 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 495 fParenStack.push(plain, *fStatus); // Beg
in a new frame. | 492 fParenStack.push(plain, *fStatus); // Beg
in a new frame. |
| 496 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 493 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
| 497 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc | 494 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc |
| 498 } | 495 } |
| 499 break; | 496 break; |
| 500 | 497 |
| 501 | 498 |
| 502 case doOpenAtomicParen: | 499 case doOpenAtomicParen: |
| 503 // Open Atomic Paren. (?> | 500 // Open Atomic Paren. (?> |
| 504 // Compile to a | 501 // Compile to a |
| 505 // - NOP, which later may be replaced if the parenthesized group | 502 // - NOP, which later may be replaced if the parenthesized group |
| 506 // has a quantifier, followed by | 503 // has a quantifier, followed by |
| 507 // - STO_SP save state stack position, so it can be restored at th
e ")" | 504 // - STO_SP save state stack position, so it can be restored at th
e ")" |
| 508 // - NOP, which may later be replaced by a save-state if there | 505 // - NOP, which may later be replaced by a save-state if there |
| 509 // is an '|' alternation within the parens. | 506 // is an '|' alternation within the parens. |
| 510 { | 507 { |
| 511 fixLiterals(); | 508 fixLiterals(); |
| 512 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 509 appendOp(URX_NOP, 0); |
| 513 int32_t varLoc = fRXPat->fDataSize; // Reserve a data locatio
n for saving the | 510 int32_t varLoc = allocateData(1); // Reserve a data location for
saving the state stack ptr. |
| 514 fRXPat->fDataSize += 1; // state stack ptr. | 511 appendOp(URX_STO_SP, varLoc); |
| 515 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); | 512 appendOp(URX_NOP, 0); |
| 516 fRXPat->fCompiledPat->addElement(stoOp, *fStatus); | |
| 517 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
| 518 | 513 |
| 519 // On the Parentheses stack, start a new frame and add the postions | 514 // On the Parentheses stack, start a new frame and add the postions |
| 520 // of the two NOPs. Depending on what follows in the pattern, the | 515 // of the two NOPs. Depending on what follows in the pattern, the |
| 521 // NOPs may be changed to SAVE_STATE or JMP ops, with a target | 516 // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
| 522 // address of the end of the parenthesized group. | 517 // address of the end of the parenthesized group. |
| 523 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 518 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 524 fParenStack.push(atomic, *fStatus); // Fra
me type. | 519 fParenStack.push(atomic, *fStatus); // Fra
me type. |
| 525 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP | 520 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP |
| 526 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP | 521 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP |
| 527 } | 522 } |
| (...skipping 22 matching lines...) Expand all Loading... |
| 550 // 6. NOP reserved for use by quantifiers on the block
. | 545 // 6. NOP reserved for use by quantifiers on the block
. |
| 551 // Look-ahead can't have quantifiers, but paren
stack | 546 // Look-ahead can't have quantifiers, but paren
stack |
| 552 // compile time conventions require the slot
anyhow. | 547 // compile time conventions require the slot
anyhow. |
| 553 // 7. NOP may be replaced if there is are '|' ops in t
he block. | 548 // 7. NOP may be replaced if there is are '|' ops in t
he block. |
| 554 // 8. code for parenthesized stuff. | 549 // 8. code for parenthesized stuff. |
| 555 // 9. LA_END | 550 // 9. LA_END |
| 556 // | 551 // |
| 557 // Two data slots are reserved, for saving the stack ptr and the input
position. | 552 // Two data slots are reserved, for saving the stack ptr and the input
position. |
| 558 { | 553 { |
| 559 fixLiterals(); | 554 fixLiterals(); |
| 560 int32_t dataLoc = fRXPat->fDataSize; | 555 int32_t dataLoc = allocateData(2); |
| 561 fRXPat->fDataSize += 2; | 556 appendOp(URX_LA_START, dataLoc); |
| 562 int32_t op = URX_BUILD(URX_LA_START, dataLoc); | 557 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); |
| 563 fRXPat->fCompiledPat->addElement(op, *fStatus); | 558 appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3); |
| 564 | 559 appendOp(URX_LA_END, dataLoc); |
| 565 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); | 560 appendOp(URX_BACKTRACK, 0); |
| 566 fRXPat->fCompiledPat->addElement(op, *fStatus); | 561 appendOp(URX_NOP, 0); |
| 567 | 562 appendOp(URX_NOP, 0); |
| 568 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); | |
| 569 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 570 | |
| 571 op = URX_BUILD(URX_LA_END, dataLoc); | |
| 572 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 573 | |
| 574 op = URX_BUILD(URX_BACKTRACK, 0); | |
| 575 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 576 | |
| 577 op = URX_BUILD(URX_NOP, 0); | |
| 578 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 579 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 580 | 563 |
| 581 // On the Parentheses stack, start a new frame and add the postions | 564 // On the Parentheses stack, start a new frame and add the postions |
| 582 // of the NOPs. | 565 // of the NOPs. |
| 583 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 566 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 584 fParenStack.push(lookAhead, *fStatus); // Fra
me type. | 567 fParenStack.push(lookAhead, *fStatus); // Fra
me type. |
| 585 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 568 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
| 586 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location | 569 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location |
| 587 } | 570 } |
| 588 break; | 571 break; |
| 589 | 572 |
| 590 case doOpenLookAheadNeg: | 573 case doOpenLookAheadNeg: |
| 591 // Negated Lookahead. (?! stuff ) | 574 // Negated Lookahead. (?! stuff ) |
| 592 // Compiles to | 575 // Compiles to |
| 593 // 1. START_LA dataloc | 576 // 1. START_LA dataloc |
| 594 // 2. SAVE_STATE 7 // Fail within look-ahead block restor
es to this state, | 577 // 2. SAVE_STATE 7 // Fail within look-ahead block restor
es to this state, |
| 595 // // which continues with the match. | 578 // // which continues with the match. |
| 596 // 3. NOP // Std. Open Paren sequence, for possi
ble '|' | 579 // 3. NOP // Std. Open Paren sequence, for possi
ble '|' |
| 597 // 4. code for parenthesized stuff. | 580 // 4. code for parenthesized stuff. |
| 598 // 5. END_LA // Cut back stack, remove saved state
from step 2. | 581 // 5. END_LA // Cut back stack, remove saved state
from step 2. |
| 599 // 6. BACKTRACK // code in block succeeded, so neg. lo
okahead fails. | 582 // 6. BACKTRACK // code in block succeeded, so neg. lo
okahead fails. |
| 600 // 7. END_LA // Restore match region, in case look-
ahead was using | 583 // 7. END_LA // Restore match region, in case look-
ahead was using |
| 601 // an alternate (transparent) reg
ion. | 584 // an alternate (transparent) reg
ion. |
| 602 { | 585 { |
| 603 fixLiterals(); | 586 fixLiterals(); |
| 604 int32_t dataLoc = fRXPat->fDataSize; | 587 int32_t dataLoc = allocateData(2); |
| 605 fRXPat->fDataSize += 2; | 588 appendOp(URX_LA_START, dataLoc); |
| 606 int32_t op = URX_BUILD(URX_LA_START, dataLoc); | 589 appendOp(URX_STATE_SAVE, 0); // dest address will be patched late
r. |
| 607 fRXPat->fCompiledPat->addElement(op, *fStatus); | 590 appendOp(URX_NOP, 0); |
| 608 | |
| 609 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patche
d later. | |
| 610 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 611 | |
| 612 op = URX_BUILD(URX_NOP, 0); | |
| 613 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 614 | 591 |
| 615 // On the Parentheses stack, start a new frame and add the postions | 592 // On the Parentheses stack, start a new frame and add the postions |
| 616 // of the StateSave and NOP. | 593 // of the StateSave and NOP. |
| 617 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 594 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 618 fParenStack.push(negLookAhead, *fStatus); // Fram
e type | 595 fParenStack.push(negLookAhead, *fStatus); // Fram
e type |
| 619 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
STATE_SAVE location | 596 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
STATE_SAVE location |
| 620 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location | 597 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location |
| 621 | 598 |
| 622 // Instructions #5 - #7 will be added when the ')' is encountered. | 599 // Instructions #5 - #7 will be added when the ')' is encountered. |
| 623 } | 600 } |
| (...skipping 17 matching lines...) Expand all Loading... |
| 641 // Allocate a block of matcher data, to contain (when runni
ng a match) | 618 // Allocate a block of matcher data, to contain (when runni
ng a match) |
| 642 // 0: Stack ptr on entry | 619 // 0: Stack ptr on entry |
| 643 // 1: Input Index on entry | 620 // 1: Input Index on entry |
| 644 // 2: Start index of match current match attempt. | 621 // 2: Start index of match current match attempt. |
| 645 // 3: Original Input String len. | 622 // 3: Original Input String len. |
| 646 | 623 |
| 647 // Generate match code for any pending literals. | 624 // Generate match code for any pending literals. |
| 648 fixLiterals(); | 625 fixLiterals(); |
| 649 | 626 |
| 650 // Allocate data space | 627 // Allocate data space |
| 651 int32_t dataLoc = fRXPat->fDataSize; | 628 int32_t dataLoc = allocateData(4); |
| 652 fRXPat->fDataSize += 4; | |
| 653 | 629 |
| 654 // Emit URX_LB_START | 630 // Emit URX_LB_START |
| 655 int32_t op = URX_BUILD(URX_LB_START, dataLoc); | 631 appendOp(URX_LB_START, dataLoc); |
| 656 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 657 | 632 |
| 658 // Emit URX_LB_CONT | 633 // Emit URX_LB_CONT |
| 659 op = URX_BUILD(URX_LB_CONT, dataLoc); | 634 appendOp(URX_LB_CONT, dataLoc); |
| 660 fRXPat->fCompiledPat->addElement(op, *fStatus); | 635 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled la
ter. |
| 661 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength
. To be filled later. | 636 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled la
ter. |
| 662 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength
. To be filled later. | |
| 663 | 637 |
| 664 // Emit the NOP | 638 // Emit the NOPs |
| 665 op = URX_BUILD(URX_NOP, 0); | 639 appendOp(URX_NOP, 0); |
| 666 fRXPat->fCompiledPat->addElement(op, *fStatus); | 640 appendOp(URX_NOP, 0); |
| 667 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 668 | 641 |
| 669 // On the Parentheses stack, start a new frame and add the postions | 642 // On the Parentheses stack, start a new frame and add the postions |
| 670 // of the URX_LB_CONT and the NOP. | 643 // of the URX_LB_CONT and the NOP. |
| 671 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 644 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 672 fParenStack.push(lookBehind, *fStatus); // Fra
me type | 645 fParenStack.push(lookBehind, *fStatus); // Fra
me type |
| 673 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 646 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
| 674 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location | 647 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location |
| 675 | 648 |
| 676 // The final two instructions will be added when the ')' is encounte
red. | 649 // The final two instructions will be added when the ')' is encounte
red. |
| 677 } | 650 } |
| (...skipping 19 matching lines...) Expand all Loading... |
| 697 // Allocate a block of matcher data, to contain (when runni
ng a match) | 670 // Allocate a block of matcher data, to contain (when runni
ng a match) |
| 698 // 0: Stack ptr on entry | 671 // 0: Stack ptr on entry |
| 699 // 1: Input Index on entry | 672 // 1: Input Index on entry |
| 700 // 2: Start index of match current match attempt. | 673 // 2: Start index of match current match attempt. |
| 701 // 3: Original Input String len. | 674 // 3: Original Input String len. |
| 702 | 675 |
| 703 // Generate match code for any pending literals. | 676 // Generate match code for any pending literals. |
| 704 fixLiterals(); | 677 fixLiterals(); |
| 705 | 678 |
| 706 // Allocate data space | 679 // Allocate data space |
| 707 int32_t dataLoc = fRXPat->fDataSize; | 680 int32_t dataLoc = allocateData(4); |
| 708 fRXPat->fDataSize += 4; | |
| 709 | 681 |
| 710 // Emit URX_LB_START | 682 // Emit URX_LB_START |
| 711 int32_t op = URX_BUILD(URX_LB_START, dataLoc); | 683 appendOp(URX_LB_START, dataLoc); |
| 712 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 713 | 684 |
| 714 // Emit URX_LBN_CONT | 685 // Emit URX_LBN_CONT |
| 715 op = URX_BUILD(URX_LBN_CONT, dataLoc); | 686 appendOp(URX_LBN_CONT, dataLoc); |
| 716 fRXPat->fCompiledPat->addElement(op, *fStatus); | 687 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled la
ter. |
| 717 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength
. To be filled later. | 688 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled la
ter. |
| 718 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength
. To be filled later. | 689 appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled la
ter. |
| 719 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc.
To be filled later. | |
| 720 | 690 |
| 721 // Emit the NOP | 691 // Emit the NOPs |
| 722 op = URX_BUILD(URX_NOP, 0); | 692 appendOp(URX_NOP, 0); |
| 723 fRXPat->fCompiledPat->addElement(op, *fStatus); | 693 appendOp(URX_NOP, 0); |
| 724 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 725 | 694 |
| 726 // On the Parentheses stack, start a new frame and add the postions | 695 // On the Parentheses stack, start a new frame and add the postions |
| 727 // of the URX_LB_CONT and the NOP. | 696 // of the URX_LB_CONT and the NOP. |
| 728 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 697 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
| 729 fParenStack.push(lookBehindN, *fStatus); // Fra
me type | 698 fParenStack.push(lookBehindN, *fStatus); // Fra
me type |
| 730 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 699 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
| 731 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location | 700 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location |
| 732 | 701 |
| 733 // The final two instructions will be added when the ')' is encounte
red. | 702 // The final two instructions will be added when the ')' is encounte
red. |
| 734 } | 703 } |
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 784 { | 753 { |
| 785 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 | 754 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 |
| 786 int32_t frameLoc; | 755 int32_t frameLoc; |
| 787 | 756 |
| 788 // Check for simple constructs, which may get special optimized code
. | 757 // Check for simple constructs, which may get special optimized code
. |
| 789 if (topLoc == fRXPat->fCompiledPat->size() - 1) { | 758 if (topLoc == fRXPat->fCompiledPat->size() - 1) { |
| 790 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); | 759 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); |
| 791 | 760 |
| 792 if (URX_TYPE(repeatedOp) == URX_SETREF) { | 761 if (URX_TYPE(repeatedOp) == URX_SETREF) { |
| 793 // Emit optimized code for [char set]+ | 762 // Emit optimized code for [char set]+ |
| 794 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); | 763 appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); |
| 795 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); | 764 frameLoc = allocateStackData(1); |
| 796 frameLoc = fRXPat->fFrameSize; | 765 appendOp(URX_LOOP_C, frameLoc); |
| 797 fRXPat->fFrameSize++; | |
| 798 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | |
| 799 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
| 800 break; | 766 break; |
| 801 } | 767 } |
| 802 | 768 |
| 803 if (URX_TYPE(repeatedOp) == URX_DOTANY || | 769 if (URX_TYPE(repeatedOp) == URX_DOTANY || |
| 804 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || | 770 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || |
| 805 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { | 771 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { |
| 806 // Emit Optimized code for .+ operations. | 772 // Emit Optimized code for .+ operations. |
| 807 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | 773 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); |
| 808 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | 774 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { |
| 809 // URX_LOOP_DOT_I operand is a flag indicating ". matche
s any" mode. | 775 // URX_LOOP_DOT_I operand is a flag indicating ". matche
s any" mode. |
| 810 loopOpI |= 1; | 776 loopOpI |= 1; |
| 811 } | 777 } |
| 812 if (fModeFlags & UREGEX_UNIX_LINES) { | 778 if (fModeFlags & UREGEX_UNIX_LINES) { |
| 813 loopOpI |= 2; | 779 loopOpI |= 2; |
| 814 } | 780 } |
| 815 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); | 781 appendOp(loopOpI); |
| 816 frameLoc = fRXPat->fFrameSize; | 782 frameLoc = allocateStackData(1); |
| 817 fRXPat->fFrameSize++; | 783 appendOp(URX_LOOP_C, frameLoc); |
| 818 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | |
| 819 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
| 820 break; | 784 break; |
| 821 } | 785 } |
| 822 | 786 |
| 823 } | 787 } |
| 824 | 788 |
| 825 // General case. | 789 // General case. |
| 826 | 790 |
| 827 // Check for minimum match length of zero, which requires | 791 // Check for minimum match length of zero, which requires |
| 828 // extra loop-breaking code. | 792 // extra loop-breaking code. |
| 829 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { | 793 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { |
| 830 // Zero length match is possible. | 794 // Zero length match is possible. |
| 831 // Emit the code sequence that can handle it. | 795 // Emit the code sequence that can handle it. |
| 832 insertOp(topLoc); | 796 insertOp(topLoc); |
| 833 frameLoc = fRXPat->fFrameSize; | 797 frameLoc = allocateStackData(1); |
| 834 fRXPat->fFrameSize++; | |
| 835 | 798 |
| 836 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); | 799 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc); |
| 837 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 800 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 838 | 801 |
| 839 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); | 802 appendOp(URX_JMP_SAV_X, topLoc+1); |
| 840 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 841 } else { | 803 } else { |
| 842 // Simpler code when the repeated body must match something non-
empty | 804 // Simpler code when the repeated body must match something non-
empty |
| 843 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); | 805 appendOp(URX_JMP_SAV, topLoc); |
| 844 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); | |
| 845 } | 806 } |
| 846 } | 807 } |
| 847 break; | 808 break; |
| 848 | 809 |
| 849 case doNGPlus: | 810 case doNGPlus: |
| 850 // Non-greedy '+?' compiles to | 811 // Non-greedy '+?' compiles to |
| 851 // 1. stuff to be repeated (already built) | 812 // 1. stuff to be repeated (already built) |
| 852 // 2. state-save 1 | 813 // 2. state-save 1 |
| 853 // 3. ... | 814 // 3. ... |
| 854 { | 815 { |
| 855 int32_t topLoc = blockTopLoc(FALSE); | 816 int32_t topLoc = blockTopLoc(FALSE); |
| 856 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); | 817 appendOp(URX_STATE_SAVE, topLoc); |
| 857 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); | |
| 858 } | 818 } |
| 859 break; | 819 break; |
| 860 | 820 |
| 861 | 821 |
| 862 case doOpt: | 822 case doOpt: |
| 863 // Normal (greedy) ? quantifier. | 823 // Normal (greedy) ? quantifier. |
| 864 // Compiles to | 824 // Compiles to |
| 865 // 1. state save 3 | 825 // 1. state save 3 |
| 866 // 2. body of optional block | 826 // 2. body of optional block |
| 867 // 3. ... | 827 // 3. ... |
| 868 // Insert the state save into the compiled pattern, and we're done. | 828 // Insert the state save into the compiled pattern, and we're done. |
| 869 { | 829 { |
| 870 int32_t saveStateLoc = blockTopLoc(TRUE); | 830 int32_t saveStateLoc = blockTopLoc(TRUE); |
| 871 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiled
Pat->size()); | 831 int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPa
t->size()); |
| 872 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); | 832 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); |
| 873 } | 833 } |
| 874 break; | 834 break; |
| 875 | 835 |
| 876 case doNGOpt: | 836 case doNGOpt: |
| 877 // Non-greedy ?? quantifier | 837 // Non-greedy ?? quantifier |
| 878 // compiles to | 838 // compiles to |
| 879 // 1. jmp 4 | 839 // 1. jmp 4 |
| 880 // 2. body of optional block | 840 // 2. body of optional block |
| 881 // 3 jmp 5 | 841 // 3 jmp 5 |
| 882 // 4. state save 2 | 842 // 4. state save 2 |
| 883 // 5 ... | 843 // 5 ... |
| 884 // This code is less than ideal, with two jmps instead of one, because
we can only | 844 // This code is less than ideal, with two jmps instead of one, because
we can only |
| 885 // insert one instruction at the top of the block being iterated. | 845 // insert one instruction at the top of the block being iterated. |
| 886 { | 846 { |
| 887 int32_t jmp1_loc = blockTopLoc(TRUE); | 847 int32_t jmp1_loc = blockTopLoc(TRUE); |
| 888 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); | 848 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); |
| 889 | 849 |
| 890 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); | 850 int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1); |
| 891 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); | 851 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); |
| 892 | 852 |
| 893 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); | 853 appendOp(URX_JMP, jmp2_loc+2); |
| 894 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); | |
| 895 | 854 |
| 896 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); | 855 appendOp(URX_STATE_SAVE, jmp1_loc+1); |
| 897 fRXPat->fCompiledPat->addElement(save_op, *fStatus); | |
| 898 } | 856 } |
| 899 break; | 857 break; |
| 900 | 858 |
| 901 | 859 |
| 902 case doStar: | 860 case doStar: |
| 903 // Normal (greedy) * quantifier. | 861 // Normal (greedy) * quantifier. |
| 904 // Compiles to | 862 // Compiles to |
| 905 // 1. STATE_SAVE 4 | 863 // 1. STATE_SAVE 4 |
| 906 // 2. body of stuff being iterated over | 864 // 2. body of stuff being iterated over |
| 907 // 3. JMP_SAV 2 | 865 // 3. JMP_SAV 2 |
| (...skipping 19 matching lines...) Expand all Loading... |
| 927 int32_t topLoc = blockTopLoc(FALSE); | 885 int32_t topLoc = blockTopLoc(FALSE); |
| 928 int32_t dataLoc = -1; | 886 int32_t dataLoc = -1; |
| 929 | 887 |
| 930 // Check for simple *, where the construct being repeated | 888 // Check for simple *, where the construct being repeated |
| 931 // compiled to single opcode, and might be optimizable. | 889 // compiled to single opcode, and might be optimizable. |
| 932 if (topLoc == fRXPat->fCompiledPat->size() - 1) { | 890 if (topLoc == fRXPat->fCompiledPat->size() - 1) { |
| 933 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); | 891 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); |
| 934 | 892 |
| 935 if (URX_TYPE(repeatedOp) == URX_SETREF) { | 893 if (URX_TYPE(repeatedOp) == URX_SETREF) { |
| 936 // Emit optimized code for a [char set]* | 894 // Emit optimized code for a [char set]* |
| 937 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); | 895 int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)
); |
| 938 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | 896 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); |
| 939 dataLoc = fRXPat->fFrameSize; | 897 dataLoc = allocateStackData(1); |
| 940 fRXPat->fFrameSize++; | 898 appendOp(URX_LOOP_C, dataLoc); |
| 941 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | |
| 942 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
| 943 break; | 899 break; |
| 944 } | 900 } |
| 945 | 901 |
| 946 if (URX_TYPE(repeatedOp) == URX_DOTANY || | 902 if (URX_TYPE(repeatedOp) == URX_DOTANY || |
| 947 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || | 903 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || |
| 948 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { | 904 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { |
| 949 // Emit Optimized code for .* operations. | 905 // Emit Optimized code for .* operations. |
| 950 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | 906 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); |
| 951 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | 907 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { |
| 952 // URX_LOOP_DOT_I operand is a flag indicating . matches
any mode. | 908 // URX_LOOP_DOT_I operand is a flag indicating . matches
any mode. |
| 953 loopOpI |= 1; | 909 loopOpI |= 1; |
| 954 } | 910 } |
| 955 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { | 911 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { |
| 956 loopOpI |= 2; | 912 loopOpI |= 2; |
| 957 } | 913 } |
| 958 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | 914 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); |
| 959 dataLoc = fRXPat->fFrameSize; | 915 dataLoc = allocateStackData(1); |
| 960 fRXPat->fFrameSize++; | 916 appendOp(URX_LOOP_C, dataLoc); |
| 961 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | |
| 962 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
| 963 break; | 917 break; |
| 964 } | 918 } |
| 965 } | 919 } |
| 966 | 920 |
| 967 // Emit general case code for this * | 921 // Emit general case code for this * |
| 968 // The optimizations did not apply. | 922 // The optimizations did not apply. |
| 969 | 923 |
| 970 int32_t saveStateLoc = blockTopLoc(TRUE); | 924 int32_t saveStateLoc = blockTopLoc(TRUE); |
| 971 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); | 925 int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1); |
| 972 | 926 |
| 973 // Check for minimum match length of zero, which requires | 927 // Check for minimum match length of zero, which requires |
| 974 // extra loop-breaking code. | 928 // extra loop-breaking code. |
| 975 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) ==
0) { | 929 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) ==
0) { |
| 976 insertOp(saveStateLoc); | 930 insertOp(saveStateLoc); |
| 977 dataLoc = fRXPat->fFrameSize; | 931 dataLoc = allocateStackData(1); |
| 978 fRXPat->fFrameSize++; | |
| 979 | 932 |
| 980 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); | 933 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc); |
| 981 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); | 934 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); |
| 982 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); | 935 jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2); |
| 983 } | 936 } |
| 984 | 937 |
| 985 // Locate the position in the compiled pattern where the match will
continue | 938 // Locate the position in the compiled pattern where the match will
continue |
| 986 // after completing the *. (4 or 5 in the comment above) | 939 // after completing the *. (4 or 5 in the comment above) |
| 987 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | 940 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
| 988 | 941 |
| 989 // Put together the save state op store it into the compiled code. | 942 // Put together the save state op and store it into the compiled cod
e. |
| 990 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); | 943 int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc); |
| 991 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); | 944 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); |
| 992 | 945 |
| 993 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled patt
ern. | 946 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled patt
ern. |
| 994 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); | 947 appendOp(jmpOp); |
| 995 } | 948 } |
| 996 break; | 949 break; |
| 997 | 950 |
| 998 case doNGStar: | 951 case doNGStar: |
| 999 // Non-greedy *? quantifier | 952 // Non-greedy *? quantifier |
| 1000 // compiles to | 953 // compiles to |
| 1001 // 1. JMP 3 | 954 // 1. JMP 3 |
| 1002 // 2. body of stuff being iterated over | 955 // 2. body of stuff being iterated over |
| 1003 // 3. STATE_SAVE 2 | 956 // 3. STATE_SAVE 2 |
| 1004 // 4 ... | 957 // 4 ... |
| 1005 { | 958 { |
| 1006 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1
. | 959 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1
. |
| 1007 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3
. | 960 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3
. |
| 1008 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); | 961 int32_t jmpOp = buildOp(URX_JMP, saveLoc); |
| 1009 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); | |
| 1010 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); | 962 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); |
| 1011 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); | 963 appendOp(URX_STATE_SAVE, jmpLoc+1); |
| 1012 } | 964 } |
| 1013 break; | 965 break; |
| 1014 | 966 |
| 1015 | 967 |
| 1016 case doIntervalInit: | 968 case doIntervalInit: |
| 1017 // The '{' opening an interval quantifier was just scanned. | 969 // The '{' opening an interval quantifier was just scanned. |
| 1018 // Init the counter varaiables that will accumulate the values as the di
gits | 970 // Init the counter varaiables that will accumulate the values as the di
gits |
| 1019 // are scanned. | 971 // are scanned. |
| 1020 fIntervalLow = 0; | 972 fIntervalLow = 0; |
| 1021 fIntervalUpper = -1; | 973 fIntervalUpper = -1; |
| (...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1070 int32_t topLoc = blockTopLoc(FALSE); | 1022 int32_t topLoc = blockTopLoc(FALSE); |
| 1071 | 1023 |
| 1072 // Produce normal looping code. | 1024 // Produce normal looping code. |
| 1073 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); | 1025 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); |
| 1074 | 1026 |
| 1075 // Surround the just-emitted normal looping code with a STO_SP ... L
D_SP | 1027 // Surround the just-emitted normal looping code with a STO_SP ... L
D_SP |
| 1076 // just as if the loop was inclosed in atomic parentheses. | 1028 // just as if the loop was inclosed in atomic parentheses. |
| 1077 | 1029 |
| 1078 // First the STO_SP before the start of the loop | 1030 // First the STO_SP before the start of the loop |
| 1079 insertOp(topLoc); | 1031 insertOp(topLoc); |
| 1080 int32_t varLoc = fRXPat->fDataSize; // Reserve a data locatio
n for saving the | 1032 |
| 1081 fRXPat->fDataSize += 1; // state stack ptr. | 1033 int32_t varLoc = allocateData(1); // Reserve a data location for
saving the |
| 1082 int32_t op = URX_BUILD(URX_STO_SP, varLoc); | 1034 int32_t op = buildOp(URX_STO_SP, varLoc); |
| 1083 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1035 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 1084 | 1036 |
| 1085 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); | 1037 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); |
| 1086 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topL
oc); | 1038 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topL
oc); |
| 1087 loopOp++; // point LoopOp after the just-inserted STO_SP | 1039 loopOp++; // point LoopOp after the just-inserted STO_SP |
| 1088 fRXPat->fCompiledPat->push(loopOp, *fStatus); | 1040 fRXPat->fCompiledPat->push(loopOp, *fStatus); |
| 1089 | 1041 |
| 1090 // Then the LD_SP after the end of the loop | 1042 // Then the LD_SP after the end of the loop |
| 1091 op = URX_BUILD(URX_LD_SP, varLoc); | 1043 appendOp(URX_LD_SP, varLoc); |
| 1092 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1093 } | 1044 } |
| 1094 | 1045 |
| 1095 break; | 1046 break; |
| 1096 | 1047 |
| 1097 case doNGInterval: | 1048 case doNGInterval: |
| 1098 // Finished scanning a non-greedy {lower,upper}? interval. Generate the
code for it. | 1049 // Finished scanning a non-greedy {lower,upper}? interval. Generate the
code for it. |
| 1099 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); | 1050 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); |
| 1100 break; | 1051 break; |
| 1101 | 1052 |
| 1102 case doIntervalError: | 1053 case doIntervalError: |
| (...skipping 15 matching lines...) Expand all Loading... |
| 1118 error(U_REGEX_BAD_ESCAPE_SEQUENCE); | 1069 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
| 1119 } | 1070 } |
| 1120 literalChar(fC.fChar); | 1071 literalChar(fC.fChar); |
| 1121 break; | 1072 break; |
| 1122 | 1073 |
| 1123 | 1074 |
| 1124 case doDotAny: | 1075 case doDotAny: |
| 1125 // scanned a ".", match any single character. | 1076 // scanned a ".", match any single character. |
| 1126 { | 1077 { |
| 1127 fixLiterals(FALSE); | 1078 fixLiterals(FALSE); |
| 1128 int32_t op; | |
| 1129 if (fModeFlags & UREGEX_DOTALL) { | 1079 if (fModeFlags & UREGEX_DOTALL) { |
| 1130 op = URX_BUILD(URX_DOTANY_ALL, 0); | 1080 appendOp(URX_DOTANY_ALL, 0); |
| 1131 } else if (fModeFlags & UREGEX_UNIX_LINES) { | 1081 } else if (fModeFlags & UREGEX_UNIX_LINES) { |
| 1132 op = URX_BUILD(URX_DOTANY_UNIX, 0); | 1082 appendOp(URX_DOTANY_UNIX, 0); |
| 1133 } else { | 1083 } else { |
| 1134 op = URX_BUILD(URX_DOTANY, 0); | 1084 appendOp(URX_DOTANY, 0); |
| 1135 } | 1085 } |
| 1136 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1137 } | 1086 } |
| 1138 break; | 1087 break; |
| 1139 | 1088 |
| 1140 case doCaret: | 1089 case doCaret: |
| 1141 { | 1090 { |
| 1142 fixLiterals(FALSE); | 1091 fixLiterals(FALSE); |
| 1143 int32_t op = 0; | |
| 1144 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1092 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
| 1145 op = URX_CARET; | 1093 appendOp(URX_CARET, 0); |
| 1146 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1094 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
| 1147 op = URX_CARET_M; | 1095 appendOp(URX_CARET_M, 0); |
| 1148 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1096 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
| 1149 op = URX_CARET; // Only testing true start of input. | 1097 appendOp(URX_CARET, 0); // Only testing true start of input. |
| 1150 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1098 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
| 1151 op = URX_CARET_M_UNIX; | 1099 appendOp(URX_CARET_M_UNIX, 0); |
| 1152 } | 1100 } |
| 1153 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | |
| 1154 } | 1101 } |
| 1155 break; | 1102 break; |
| 1156 | 1103 |
| 1157 case doDollar: | 1104 case doDollar: |
| 1158 { | 1105 { |
| 1159 fixLiterals(FALSE); | 1106 fixLiterals(FALSE); |
| 1160 int32_t op = 0; | |
| 1161 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1107 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
| 1162 op = URX_DOLLAR; | 1108 appendOp(URX_DOLLAR, 0); |
| 1163 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1109 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
| 1164 op = URX_DOLLAR_M; | 1110 appendOp(URX_DOLLAR_M, 0); |
| 1165 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1111 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
| 1166 op = URX_DOLLAR_D; | 1112 appendOp(URX_DOLLAR_D, 0); |
| 1167 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1113 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
| 1168 op = URX_DOLLAR_MD; | 1114 appendOp(URX_DOLLAR_MD, 0); |
| 1169 } | 1115 } |
| 1170 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | |
| 1171 } | 1116 } |
| 1172 break; | 1117 break; |
| 1173 | 1118 |
| 1174 case doBackslashA: | 1119 case doBackslashA: |
| 1175 fixLiterals(FALSE); | 1120 fixLiterals(FALSE); |
| 1176 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); | 1121 appendOp(URX_CARET, 0); |
| 1177 break; | 1122 break; |
| 1178 | 1123 |
| 1179 case doBackslashB: | 1124 case doBackslashB: |
| 1180 { | 1125 { |
| 1181 #if UCONFIG_NO_BREAK_ITERATION==1 | 1126 #if UCONFIG_NO_BREAK_ITERATION==1 |
| 1182 if (fModeFlags & UREGEX_UWORD) { | 1127 if (fModeFlags & UREGEX_UWORD) { |
| 1183 error(U_UNSUPPORTED_ERROR); | 1128 error(U_UNSUPPORTED_ERROR); |
| 1184 } | 1129 } |
| 1185 #endif | 1130 #endif |
| 1186 fixLiterals(FALSE); | 1131 fixLiterals(FALSE); |
| 1187 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; | 1132 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; |
| 1188 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); | 1133 appendOp(op, 1); |
| 1189 } | 1134 } |
| 1190 break; | 1135 break; |
| 1191 | 1136 |
| 1192 case doBackslashb: | 1137 case doBackslashb: |
| 1193 { | 1138 { |
| 1194 #if UCONFIG_NO_BREAK_ITERATION==1 | 1139 #if UCONFIG_NO_BREAK_ITERATION==1 |
| 1195 if (fModeFlags & UREGEX_UWORD) { | 1140 if (fModeFlags & UREGEX_UWORD) { |
| 1196 error(U_UNSUPPORTED_ERROR); | 1141 error(U_UNSUPPORTED_ERROR); |
| 1197 } | 1142 } |
| 1198 #endif | 1143 #endif |
| 1199 fixLiterals(FALSE); | 1144 fixLiterals(FALSE); |
| 1200 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; | 1145 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; |
| 1201 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | 1146 appendOp(op, 0); |
| 1202 } | 1147 } |
| 1203 break; | 1148 break; |
| 1204 | 1149 |
| 1205 case doBackslashD: | 1150 case doBackslashD: |
| 1206 fixLiterals(FALSE); | 1151 fixLiterals(FALSE); |
| 1207 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus
); | 1152 appendOp(URX_BACKSLASH_D, 1); |
| 1208 break; | 1153 break; |
| 1209 | 1154 |
| 1210 case doBackslashd: | 1155 case doBackslashd: |
| 1211 fixLiterals(FALSE); | 1156 fixLiterals(FALSE); |
| 1212 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus
); | 1157 appendOp(URX_BACKSLASH_D, 0); |
| 1213 break; | 1158 break; |
| 1214 | 1159 |
| 1215 case doBackslashG: | 1160 case doBackslashG: |
| 1216 fixLiterals(FALSE); | 1161 fixLiterals(FALSE); |
| 1217 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus
); | 1162 appendOp(URX_BACKSLASH_G, 0); |
| 1218 break; | 1163 break; |
| 1219 | 1164 |
| 1220 case doBackslashS: | 1165 case doBackslashS: |
| 1221 fixLiterals(FALSE); | 1166 fixLiterals(FALSE); |
| 1222 fRXPat->fCompiledPat->addElement( | 1167 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); |
| 1223 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); | |
| 1224 break; | 1168 break; |
| 1225 | 1169 |
| 1226 case doBackslashs: | 1170 case doBackslashs: |
| 1227 fixLiterals(FALSE); | 1171 fixLiterals(FALSE); |
| 1228 fRXPat->fCompiledPat->addElement( | 1172 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); |
| 1229 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); | |
| 1230 break; | 1173 break; |
| 1231 | 1174 |
| 1232 case doBackslashW: | 1175 case doBackslashW: |
| 1233 fixLiterals(FALSE); | 1176 fixLiterals(FALSE); |
| 1234 fRXPat->fCompiledPat->addElement( | 1177 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); |
| 1235 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); | |
| 1236 break; | 1178 break; |
| 1237 | 1179 |
| 1238 case doBackslashw: | 1180 case doBackslashw: |
| 1239 fixLiterals(FALSE); | 1181 fixLiterals(FALSE); |
| 1240 fRXPat->fCompiledPat->addElement( | 1182 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); |
| 1241 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); | |
| 1242 break; | 1183 break; |
| 1243 | 1184 |
| 1244 case doBackslashX: | 1185 case doBackslashX: |
| 1245 fixLiterals(FALSE); | 1186 fixLiterals(FALSE); |
| 1246 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus
); | 1187 appendOp(URX_BACKSLASH_X, 0); |
| 1247 break; | 1188 break; |
| 1248 | 1189 |
| 1249 | 1190 |
| 1250 case doBackslashZ: | 1191 case doBackslashZ: |
| 1251 fixLiterals(FALSE); | 1192 fixLiterals(FALSE); |
| 1252 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); | 1193 appendOp(URX_DOLLAR, 0); |
| 1253 break; | 1194 break; |
| 1254 | 1195 |
| 1255 case doBackslashz: | 1196 case doBackslashz: |
| 1256 fixLiterals(FALSE); | 1197 fixLiterals(FALSE); |
| 1257 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus
); | 1198 appendOp(URX_BACKSLASH_Z, 0); |
| 1258 break; | 1199 break; |
| 1259 | 1200 |
| 1260 case doEscapeError: | 1201 case doEscapeError: |
| 1261 error(U_REGEX_BAD_ESCAPE_SEQUENCE); | 1202 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
| 1262 break; | 1203 break; |
| 1263 | 1204 |
| 1264 case doExit: | 1205 case doExit: |
| 1265 fixLiterals(FALSE); | 1206 fixLiterals(FALSE); |
| 1266 returnVal = FALSE; | 1207 returnVal = FALSE; |
| 1267 break; | 1208 break; |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1307 } | 1248 } |
| 1308 | 1249 |
| 1309 // Scan of the back reference in the source regexp is complete. Now
generate | 1250 // Scan of the back reference in the source regexp is complete. Now
generate |
| 1310 // the compiled code for it. | 1251 // the compiled code for it. |
| 1311 // Because capture groups can be forward-referenced by back-referenc
es, | 1252 // Because capture groups can be forward-referenced by back-referenc
es, |
| 1312 // we fill the operand with the capture group number. At the end | 1253 // we fill the operand with the capture group number. At the end |
| 1313 // of compilation, it will be changed to the variable's location. | 1254 // of compilation, it will be changed to the variable's location. |
| 1314 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal
escape sequence, | 1255 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal
escape sequence, |
| 1315 // and shouldn't enter this code path at
all. | 1256 // and shouldn't enter this code path at
all. |
| 1316 fixLiterals(FALSE); | 1257 fixLiterals(FALSE); |
| 1317 int32_t op; | |
| 1318 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1258 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 1319 op = URX_BUILD(URX_BACKREF_I, groupNum); | 1259 appendOp(URX_BACKREF_I, groupNum); |
| 1320 } else { | 1260 } else { |
| 1321 op = URX_BUILD(URX_BACKREF, groupNum); | 1261 appendOp(URX_BACKREF, groupNum); |
| 1322 } | 1262 } |
| 1323 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1324 } | 1263 } |
| 1325 break; | 1264 break; |
| 1326 | 1265 |
| 1327 | 1266 |
| 1328 case doPossessivePlus: | 1267 case doPossessivePlus: |
| 1329 // Possessive ++ quantifier. | 1268 // Possessive ++ quantifier. |
| 1330 // Compiles to | 1269 // Compiles to |
| 1331 // 1. STO_SP | 1270 // 1. STO_SP |
| 1332 // 2. body of stuff being iterated over | 1271 // 2. body of stuff being iterated over |
| 1333 // 3. STATE_SAVE 5 | 1272 // 3. STATE_SAVE 5 |
| 1334 // 4. JMP 2 | 1273 // 4. JMP 2 |
| 1335 // 5. LD_SP | 1274 // 5. LD_SP |
| 1336 // 6. ... | 1275 // 6. ... |
| 1337 // | 1276 // |
| 1338 // Note: TODO: This is pretty inefficient. A mass of saved state is
built up | 1277 // Note: TODO: This is pretty inefficient. A mass of saved state is
built up |
| 1339 // then unconditionally discarded. Perhaps introduce a n
ew opcode. Ticket 6056 | 1278 // then unconditionally discarded. Perhaps introduce a n
ew opcode. Ticket 6056 |
| 1340 // | 1279 // |
| 1341 { | 1280 { |
| 1342 // Emit the STO_SP | 1281 // Emit the STO_SP |
| 1343 int32_t topLoc = blockTopLoc(TRUE); | 1282 int32_t topLoc = blockTopLoc(TRUE); |
| 1344 int32_t stoLoc = fRXPat->fDataSize; | 1283 int32_t stoLoc = allocateData(1); // Reserve the data location fo
r storing save stack ptr. |
| 1345 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. | 1284 int32_t op = buildOp(URX_STO_SP, stoLoc); |
| 1346 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | |
| 1347 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1285 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 1348 | 1286 |
| 1349 // Emit the STATE_SAVE | 1287 // Emit the STATE_SAVE |
| 1350 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); | 1288 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); |
| 1351 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1352 | 1289 |
| 1353 // Emit the JMP | 1290 // Emit the JMP |
| 1354 op = URX_BUILD(URX_JMP, topLoc+1); | 1291 appendOp(URX_JMP, topLoc+1); |
| 1355 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1356 | 1292 |
| 1357 // Emit the LD_SP | 1293 // Emit the LD_SP |
| 1358 op = URX_BUILD(URX_LD_SP, stoLoc); | 1294 appendOp(URX_LD_SP, stoLoc); |
| 1359 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1360 } | 1295 } |
| 1361 break; | 1296 break; |
| 1362 | 1297 |
| 1363 case doPossessiveStar: | 1298 case doPossessiveStar: |
| 1364 // Possessive *+ quantifier. | 1299 // Possessive *+ quantifier. |
| 1365 // Compiles to | 1300 // Compiles to |
| 1366 // 1. STO_SP loc | 1301 // 1. STO_SP loc |
| 1367 // 2. STATE_SAVE 5 | 1302 // 2. STATE_SAVE 5 |
| 1368 // 3. body of stuff being iterated over | 1303 // 3. body of stuff being iterated over |
| 1369 // 4. JMP 2 | 1304 // 4. JMP 2 |
| 1370 // 5. LD_SP loc | 1305 // 5. LD_SP loc |
| 1371 // 6 ... | 1306 // 6 ... |
| 1372 // TODO: do something to cut back the state stack each time through the
loop. | 1307 // TODO: do something to cut back the state stack each time through the
loop. |
| 1373 { | 1308 { |
| 1374 // Reserve two slots at the top of the block. | 1309 // Reserve two slots at the top of the block. |
| 1375 int32_t topLoc = blockTopLoc(TRUE); | 1310 int32_t topLoc = blockTopLoc(TRUE); |
| 1376 insertOp(topLoc); | 1311 insertOp(topLoc); |
| 1377 | 1312 |
| 1378 // emit STO_SP loc | 1313 // emit STO_SP loc |
| 1379 int32_t stoLoc = fRXPat->fDataSize; | 1314 int32_t stoLoc = allocateData(1); // Reserve the data location
for storing save stack ptr. |
| 1380 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. | 1315 int32_t op = buildOp(URX_STO_SP, stoLoc); |
| 1381 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | |
| 1382 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1316 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 1383 | 1317 |
| 1384 // Emit the SAVE_STATE 5 | 1318 // Emit the SAVE_STATE 5 |
| 1385 int32_t L7 = fRXPat->fCompiledPat->size()+1; | 1319 int32_t L7 = fRXPat->fCompiledPat->size()+1; |
| 1386 op = URX_BUILD(URX_STATE_SAVE, L7); | 1320 op = buildOp(URX_STATE_SAVE, L7); |
| 1387 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | 1321 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); |
| 1388 | 1322 |
| 1389 // Append the JMP operation. | 1323 // Append the JMP operation. |
| 1390 op = URX_BUILD(URX_JMP, topLoc+1); | 1324 appendOp(URX_JMP, topLoc+1); |
| 1391 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1392 | 1325 |
| 1393 // Emit the LD_SP loc | 1326 // Emit the LD_SP loc |
| 1394 op = URX_BUILD(URX_LD_SP, stoLoc); | 1327 appendOp(URX_LD_SP, stoLoc); |
| 1395 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1396 } | 1328 } |
| 1397 break; | 1329 break; |
| 1398 | 1330 |
| 1399 case doPossessiveOpt: | 1331 case doPossessiveOpt: |
| 1400 // Possessive ?+ quantifier. | 1332 // Possessive ?+ quantifier. |
| 1401 // Compiles to | 1333 // Compiles to |
| 1402 // 1. STO_SP loc | 1334 // 1. STO_SP loc |
| 1403 // 2. SAVE_STATE 5 | 1335 // 2. SAVE_STATE 5 |
| 1404 // 3. body of optional block | 1336 // 3. body of optional block |
| 1405 // 4. LD_SP loc | 1337 // 4. LD_SP loc |
| 1406 // 5. ... | 1338 // 5. ... |
| 1407 // | 1339 // |
| 1408 { | 1340 { |
| 1409 // Reserve two slots at the top of the block. | 1341 // Reserve two slots at the top of the block. |
| 1410 int32_t topLoc = blockTopLoc(TRUE); | 1342 int32_t topLoc = blockTopLoc(TRUE); |
| 1411 insertOp(topLoc); | 1343 insertOp(topLoc); |
| 1412 | 1344 |
| 1413 // Emit the STO_SP | 1345 // Emit the STO_SP |
| 1414 int32_t stoLoc = fRXPat->fDataSize; | 1346 int32_t stoLoc = allocateData(1); // Reserve the data location f
or storing save stack ptr. |
| 1415 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. | 1347 int32_t op = buildOp(URX_STO_SP, stoLoc); |
| 1416 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | |
| 1417 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1348 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 1418 | 1349 |
| 1419 // Emit the SAVE_STATE | 1350 // Emit the SAVE_STATE |
| 1420 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | 1351 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
| 1421 op = URX_BUILD(URX_STATE_SAVE, continueLoc); | 1352 op = buildOp(URX_STATE_SAVE, continueLoc); |
| 1422 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | 1353 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); |
| 1423 | 1354 |
| 1424 // Emit the LD_SP | 1355 // Emit the LD_SP |
| 1425 op = URX_BUILD(URX_LD_SP, stoLoc); | 1356 appendOp(URX_LD_SP, stoLoc); |
| 1426 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1427 } | 1357 } |
| 1428 break; | 1358 break; |
| 1429 | 1359 |
| 1430 | 1360 |
| 1431 case doBeginMatchMode: | 1361 case doBeginMatchMode: |
| 1432 fNewModeFlags = fModeFlags; | 1362 fNewModeFlags = fModeFlags; |
| 1433 fSetModeFlag = TRUE; | 1363 fSetModeFlag = TRUE; |
| 1434 break; | 1364 break; |
| 1435 | 1365 |
| 1436 case doMatchMode: // (?i) and similar | 1366 case doMatchMode: // (?i) and similar |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1473 // We've got a (?i: or similar. Begin a parenthesized block, save old | 1403 // We've got a (?i: or similar. Begin a parenthesized block, save old |
| 1474 // mode flags so they can be restored at the close of the block. | 1404 // mode flags so they can be restored at the close of the block. |
| 1475 // | 1405 // |
| 1476 // Compile to a | 1406 // Compile to a |
| 1477 // - NOP, which later may be replaced by a save-state if the | 1407 // - NOP, which later may be replaced by a save-state if the |
| 1478 // parenthesized group gets a * quantifier, followed by | 1408 // parenthesized group gets a * quantifier, followed by |
| 1479 // - NOP, which may later be replaced by a save-state if there | 1409 // - NOP, which may later be replaced by a save-state if there |
| 1480 // is an '|' alternation within the parens. | 1410 // is an '|' alternation within the parens. |
| 1481 { | 1411 { |
| 1482 fixLiterals(FALSE); | 1412 fixLiterals(FALSE); |
| 1483 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 1413 appendOp(URX_NOP, 0); |
| 1484 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 1414 appendOp(URX_NOP, 0); |
| 1485 | 1415 |
| 1486 // On the Parentheses stack, start a new frame and add the postions | 1416 // On the Parentheses stack, start a new frame and add the postions |
| 1487 // of the two NOPs (a normal non-capturing () frame, except for th
e | 1417 // of the two NOPs (a normal non-capturing () frame, except for th
e |
| 1488 // saving of the orignal mode flags.) | 1418 // saving of the orignal mode flags.) |
| 1489 fParenStack.push(fModeFlags, *fStatus); | 1419 fParenStack.push(fModeFlags, *fStatus); |
| 1490 fParenStack.push(flags, *fStatus); // Fra
me Marker | 1420 fParenStack.push(flags, *fStatus); // Fra
me Marker |
| 1491 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP | 1421 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP |
| 1492 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP | 1422 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP |
| 1493 | 1423 |
| 1494 // Set the current mode flags to the new values. | 1424 // Set the current mode flags to the new values. |
| (...skipping 316 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1811 // string in a pattern, emit the code to match the | 1741 // string in a pattern, emit the code to match the |
| 1812 // accumulated literal string. | 1742 // accumulated literal string. |
| 1813 // | 1743 // |
| 1814 // Optionally, split the last char of the string off in
to | 1744 // Optionally, split the last char of the string off in
to |
| 1815 // a single "ONE_CHAR" operation, so that quantifiers c
an | 1745 // a single "ONE_CHAR" operation, so that quantifiers c
an |
| 1816 // apply to that char alone. Example: abc* | 1746 // apply to that char alone. Example: abc* |
| 1817 // The * must apply to the 'c' only. | 1747 // The * must apply to the 'c' only. |
| 1818 // | 1748 // |
| 1819 //------------------------------------------------------------------------------ | 1749 //------------------------------------------------------------------------------ |
| 1820 void RegexCompile::fixLiterals(UBool split) { | 1750 void RegexCompile::fixLiterals(UBool split) { |
| 1821 int32_t op = 0; // An op from/for the compiled patter
n. | |
| 1822 | 1751 |
| 1823 // If no literal characters have been scanned but not yet had code generated | 1752 // If no literal characters have been scanned but not yet had code generated |
| 1824 // for them, nothing needs to be done. | 1753 // for them, nothing needs to be done. |
| 1825 if (fLiteralChars.length() == 0) { | 1754 if (fLiteralChars.length() == 0) { |
| 1826 return; | 1755 return; |
| 1827 } | 1756 } |
| 1828 | 1757 |
| 1829 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.lengt
h(), -1); | 1758 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.lengt
h(), -1); |
| 1830 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); | 1759 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); |
| 1831 | 1760 |
| (...skipping 18 matching lines...) Expand all Loading... |
| 1850 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1779 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 1851 fLiteralChars.foldCase(); | 1780 fLiteralChars.foldCase(); |
| 1852 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(),
-1); | 1781 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(),
-1); |
| 1853 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); | 1782 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); |
| 1854 } | 1783 } |
| 1855 | 1784 |
| 1856 if (indexOfLastCodePoint == 0) { | 1785 if (indexOfLastCodePoint == 0) { |
| 1857 // Single character, emit a URX_ONECHAR op to match it. | 1786 // Single character, emit a URX_ONECHAR op to match it. |
| 1858 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && | 1787 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && |
| 1859 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { | 1788 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { |
| 1860 op = URX_BUILD(URX_ONECHAR_I, lastCodePoint); | 1789 appendOp(URX_ONECHAR_I, lastCodePoint); |
| 1861 } else { | 1790 } else { |
| 1862 op = URX_BUILD(URX_ONECHAR, lastCodePoint); | 1791 appendOp(URX_ONECHAR, lastCodePoint); |
| 1863 } | 1792 } |
| 1864 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1865 } else { | 1793 } else { |
| 1866 // Two or more chars, emit a URX_STRING to match them. | 1794 // Two or more chars, emit a URX_STRING to match them. |
| 1795 if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length()
> 0x00ffffff) { |
| 1796 error(U_REGEX_PATTERN_TOO_BIG); |
| 1797 } |
| 1867 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1798 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 1868 op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length()); | 1799 appendOp(URX_STRING_I, fRXPat->fLiteralText.length()); |
| 1869 } else { | 1800 } else { |
| 1870 // TODO here: add optimization to split case sensitive strings of l
ength two | 1801 // TODO here: add optimization to split case sensitive strings of l
ength two |
| 1871 // into two single char ops, for efficiency. | 1802 // into two single char ops, for efficiency. |
| 1872 op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length()); | 1803 appendOp(URX_STRING, fRXPat->fLiteralText.length()); |
| 1873 } | 1804 } |
| 1874 fRXPat->fCompiledPat->addElement(op, *fStatus); | 1805 appendOp(URX_STRING_LEN, fLiteralChars.length()); |
| 1875 op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length()); | |
| 1876 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1877 | 1806 |
| 1878 // Add this string into the accumulated strings of the compiled pattern. | 1807 // Add this string into the accumulated strings of the compiled pattern. |
| 1879 fRXPat->fLiteralText.append(fLiteralChars); | 1808 fRXPat->fLiteralText.append(fLiteralChars); |
| 1880 } | 1809 } |
| 1881 | 1810 |
| 1882 fLiteralChars.remove(); | 1811 fLiteralChars.remove(); |
| 1883 } | 1812 } |
| 1884 | 1813 |
| 1885 | 1814 |
| 1886 | 1815 int32_t RegexCompile::buildOp(int32_t type, int32_t val) { |
| 1887 | 1816 if (U_FAILURE(*fStatus)) { |
| 1817 return 0; |
| 1818 } |
| 1819 if (type < 0 || type > 255) { |
| 1820 U_ASSERT(FALSE); |
| 1821 error(U_REGEX_INTERNAL_ERROR); |
| 1822 type = URX_RESERVED_OP; |
| 1823 } |
| 1824 if (val > 0x00ffffff) { |
| 1825 U_ASSERT(FALSE); |
| 1826 error(U_REGEX_INTERNAL_ERROR); |
| 1827 val = 0; |
| 1828 } |
| 1829 if (val < 0) { |
| 1830 if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) { |
| 1831 U_ASSERT(FALSE); |
| 1832 error(U_REGEX_INTERNAL_ERROR); |
| 1833 return -1; |
| 1834 } |
| 1835 if (URX_TYPE(val) != 0xff) { |
| 1836 U_ASSERT(FALSE); |
| 1837 error(U_REGEX_INTERNAL_ERROR); |
| 1838 return -1; |
| 1839 } |
| 1840 type = URX_RESERVED_OP_N; |
| 1841 } |
| 1842 return (type << 24) | val; |
| 1843 } |
| 1888 | 1844 |
| 1889 | 1845 |
| 1890 //------------------------------------------------------------------------------ | 1846 //------------------------------------------------------------------------------ |
| 1847 // |
| 1848 // appendOp() Append a new instruction onto the compiled pattern |
| 1849 // Includes error checking, limiting the size of the |
| 1850 // pattern to lengths that can be represented in the |
| 1851 // 24 bit operand field of an instruction. |
| 1852 // |
| 1853 //------------------------------------------------------------------------------ |
| 1854 void RegexCompile::appendOp(int32_t op) { |
| 1855 if (U_FAILURE(*fStatus)) { |
| 1856 return; |
| 1857 } |
| 1858 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1859 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) { |
| 1860 error(U_REGEX_PATTERN_TOO_BIG); |
| 1861 } |
| 1862 } |
| 1863 |
| 1864 void RegexCompile::appendOp(int32_t type, int32_t val) { |
| 1865 appendOp(buildOp(type, val)); |
| 1866 } |
| 1867 |
| 1868 |
| 1869 //------------------------------------------------------------------------------ |
| 1891 // | 1870 // |
| 1892 // insertOp() Insert a slot for a new opcode into the already | 1871 // insertOp() Insert a slot for a new opcode into the already |
| 1893 // compiled pattern code. | 1872 // compiled pattern code. |
| 1894 // | 1873 // |
| 1895 // Fill the slot with a NOP. Our caller will replace i
t | 1874 // Fill the slot with a NOP. Our caller will replace i
t |
| 1896 // with what they really wanted. | 1875 // with what they really wanted. |
| 1897 // | 1876 // |
| 1898 //------------------------------------------------------------------------------ | 1877 //------------------------------------------------------------------------------ |
| 1899 void RegexCompile::insertOp(int32_t where) { | 1878 void RegexCompile::insertOp(int32_t where) { |
| 1900 UVector64 *code = fRXPat->fCompiledPat; | 1879 UVector64 *code = fRXPat->fCompiledPat; |
| 1901 U_ASSERT(where>0 && where < code->size()); | 1880 U_ASSERT(where>0 && where < code->size()); |
| 1902 | 1881 |
| 1903 int32_t nop = URX_BUILD(URX_NOP, 0); | 1882 int32_t nop = buildOp(URX_NOP, 0); |
| 1904 code->insertElementAt(nop, where, *fStatus); | 1883 code->insertElementAt(nop, where, *fStatus); |
| 1905 | 1884 |
| 1906 // Walk through the pattern, looking for any ops with targets that | 1885 // Walk through the pattern, looking for any ops with targets that |
| 1907 // were moved down by the insert. Fix them. | 1886 // were moved down by the insert. Fix them. |
| 1908 int32_t loc; | 1887 int32_t loc; |
| 1909 for (loc=0; loc<code->size(); loc++) { | 1888 for (loc=0; loc<code->size(); loc++) { |
| 1910 int32_t op = (int32_t)code->elementAti(loc); | 1889 int32_t op = (int32_t)code->elementAti(loc); |
| 1911 int32_t opType = URX_TYPE(op); | 1890 int32_t opType = URX_TYPE(op); |
| 1912 int32_t opValue = URX_VAL(op); | 1891 int32_t opValue = URX_VAL(op); |
| 1913 if ((opType == URX_JMP || | 1892 if ((opType == URX_JMP || |
| 1914 opType == URX_JMPX || | 1893 opType == URX_JMPX || |
| 1915 opType == URX_STATE_SAVE || | 1894 opType == URX_STATE_SAVE || |
| 1916 opType == URX_CTR_LOOP || | 1895 opType == URX_CTR_LOOP || |
| 1917 opType == URX_CTR_LOOP_NG || | 1896 opType == URX_CTR_LOOP_NG || |
| 1918 opType == URX_JMP_SAV || | 1897 opType == URX_JMP_SAV || |
| 1919 opType == URX_JMP_SAV_X || | 1898 opType == URX_JMP_SAV_X || |
| 1920 opType == URX_RELOC_OPRND) && opValue > where) { | 1899 opType == URX_RELOC_OPRND) && opValue > where) { |
| 1921 // Target location for this opcode is after the insertion point and | 1900 // Target location for this opcode is after the insertion point and |
| 1922 // needs to be incremented to adjust for the insertion. | 1901 // needs to be incremented to adjust for the insertion. |
| 1923 opValue++; | 1902 opValue++; |
| 1924 op = URX_BUILD(opType, opValue); | 1903 op = buildOp(opType, opValue); |
| 1925 code->setElementAt(op, loc); | 1904 code->setElementAt(op, loc); |
| 1926 } | 1905 } |
| 1927 } | 1906 } |
| 1928 | 1907 |
| 1929 // Now fix up the parentheses stack. All positive values in it are location
s in | 1908 // Now fix up the parentheses stack. All positive values in it are location
s in |
| 1930 // the compiled pattern. (Negative values are frame boundaries, and don't
need fixing.) | 1909 // the compiled pattern. (Negative values are frame boundaries, and don't
need fixing.) |
| 1931 for (loc=0; loc<fParenStack.size(); loc++) { | 1910 for (loc=0; loc<fParenStack.size(); loc++) { |
| 1932 int32_t x = fParenStack.elementAti(loc); | 1911 int32_t x = fParenStack.elementAti(loc); |
| 1933 U_ASSERT(x < code->size()); | 1912 U_ASSERT(x < code->size()); |
| 1934 if (x>where) { | 1913 if (x>where) { |
| 1935 x++; | 1914 x++; |
| 1936 fParenStack.setElementAt(x, loc); | 1915 fParenStack.setElementAt(x, loc); |
| 1937 } | 1916 } |
| 1938 } | 1917 } |
| 1939 | 1918 |
| 1940 if (fMatchCloseParen > where) { | 1919 if (fMatchCloseParen > where) { |
| 1941 fMatchCloseParen++; | 1920 fMatchCloseParen++; |
| 1942 } | 1921 } |
| 1943 if (fMatchOpenParen > where) { | 1922 if (fMatchOpenParen > where) { |
| 1944 fMatchOpenParen++; | 1923 fMatchOpenParen++; |
| 1945 } | 1924 } |
| 1946 } | 1925 } |
| 1947 | 1926 |
| 1948 | 1927 |
| 1949 | |
| 1950 //------------------------------------------------------------------------------ | 1928 //------------------------------------------------------------------------------ |
| 1951 // | 1929 // |
| 1930 // allocateData() Allocate storage in the matcher's static data area. |
| 1931 // Return the index for the newly allocated data. |
| 1932 // The storage won't actually exist until we are running
a match |
| 1933 // operation, but the storage indexes are inserted into
various |
| 1934 // opcodes while compiling the pattern. |
| 1935 // |
| 1936 //------------------------------------------------------------------------------ |
| 1937 int32_t RegexCompile::allocateData(int32_t size) { |
| 1938 if (U_FAILURE(*fStatus)) { |
| 1939 return 0; |
| 1940 } |
| 1941 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) { |
| 1942 error(U_REGEX_INTERNAL_ERROR); |
| 1943 return 0; |
| 1944 } |
| 1945 int32_t dataIndex = fRXPat->fDataSize; |
| 1946 fRXPat->fDataSize += size; |
| 1947 if (fRXPat->fDataSize >= 0x00fffff0) { |
| 1948 error(U_REGEX_INTERNAL_ERROR); |
| 1949 } |
| 1950 return dataIndex; |
| 1951 } |
| 1952 |
| 1953 |
| 1954 //------------------------------------------------------------------------------ |
| 1955 // |
| 1956 // allocateStackData() Allocate space in the back-tracking stack frame. |
| 1957 // Return the index for the newly allocated data. |
| 1958 // The frame indexes are inserted into various |
| 1959 // opcodes while compiling the pattern, meaning that fra
me |
| 1960 // size must be restricted to the size that will fit |
| 1961 // as an operand (24 bits). |
| 1962 // |
| 1963 //------------------------------------------------------------------------------ |
| 1964 int32_t RegexCompile::allocateStackData(int32_t size) { |
| 1965 if (U_FAILURE(*fStatus)) { |
| 1966 return 0; |
| 1967 } |
| 1968 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) { |
| 1969 error(U_REGEX_INTERNAL_ERROR); |
| 1970 return 0; |
| 1971 } |
| 1972 int32_t dataIndex = fRXPat->fFrameSize; |
| 1973 fRXPat->fFrameSize += size; |
| 1974 if (fRXPat->fFrameSize >= 0x00fffff0) { |
| 1975 error(U_REGEX_PATTERN_TOO_BIG); |
| 1976 } |
| 1977 return dataIndex; |
| 1978 } |
| 1979 |
| 1980 |
| 1981 //------------------------------------------------------------------------------ |
| 1982 // |
| 1952 // blockTopLoc() Find or create a location in the compiled pattern | 1983 // blockTopLoc() Find or create a location in the compiled pattern |
| 1953 // at the start of the operation or block that has | 1984 // at the start of the operation or block that has |
| 1954 // just been compiled. Needed when a quantifier (* or | 1985 // just been compiled. Needed when a quantifier (* or |
| 1955 // whatever) appears, and we need to add an operation | 1986 // whatever) appears, and we need to add an operation |
| 1956 // at the start of the thing being quantified. | 1987 // at the start of the thing being quantified. |
| 1957 // | 1988 // |
| 1958 // (Parenthesized Blocks) have a slot with a NOP that | 1989 // (Parenthesized Blocks) have a slot with a NOP that |
| 1959 // is reserved for this purpose. .* or similar don't | 1990 // is reserved for this purpose. .* or similar don't |
| 1960 // and a slot needs to be added. | 1991 // and a slot needs to be added. |
| 1961 // | 1992 // |
| (...skipping 19 matching lines...) Expand all Loading... |
| 1981 // No slot for STATE_SAVE was pre-reserved in the compiled code. | 2012 // No slot for STATE_SAVE was pre-reserved in the compiled code. |
| 1982 // We need to make space now. | 2013 // We need to make space now. |
| 1983 theLoc = fRXPat->fCompiledPat->size()-1; | 2014 theLoc = fRXPat->fCompiledPat->size()-1; |
| 1984 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc); | 2015 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc); |
| 1985 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) { | 2016 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) { |
| 1986 // Strings take two opcode, we want the position of the first one. | 2017 // Strings take two opcode, we want the position of the first one. |
| 1987 // We can have a string at this point if a single character case-fol
ded to two. | 2018 // We can have a string at this point if a single character case-fol
ded to two. |
| 1988 theLoc--; | 2019 theLoc--; |
| 1989 } | 2020 } |
| 1990 if (reserveLoc) { | 2021 if (reserveLoc) { |
| 1991 int32_t nop = URX_BUILD(URX_NOP, 0); | 2022 int32_t nop = buildOp(URX_NOP, 0); |
| 1992 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); | 2023 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); |
| 1993 } | 2024 } |
| 1994 } | 2025 } |
| 1995 return theLoc; | 2026 return theLoc; |
| 1996 } | 2027 } |
| 1997 | 2028 |
| 1998 | 2029 |
| 1999 | 2030 |
| 2000 //------------------------------------------------------------------------------ | 2031 //------------------------------------------------------------------------------ |
| 2001 // | 2032 // |
| (...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2056 case capturing: | 2087 case capturing: |
| 2057 // Capturing Parentheses. | 2088 // Capturing Parentheses. |
| 2058 // Insert a End Capture op into the pattern. | 2089 // Insert a End Capture op into the pattern. |
| 2059 // The frame offset of the variables for this cg is obtained from the | 2090 // The frame offset of the variables for this cg is obtained from the |
| 2060 // start capture op and put it into the end-capture op. | 2091 // start capture op and put it into the end-capture op. |
| 2061 { | 2092 { |
| 2062 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMat
chOpenParen+1); | 2093 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMat
chOpenParen+1); |
| 2063 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); | 2094 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); |
| 2064 | 2095 |
| 2065 int32_t frameVarLocation = URX_VAL(captureOp); | 2096 int32_t frameVarLocation = URX_VAL(captureOp); |
| 2066 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation
); | 2097 appendOp(URX_END_CAPTURE, frameVarLocation); |
| 2067 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); | |
| 2068 } | 2098 } |
| 2069 break; | 2099 break; |
| 2070 case atomic: | 2100 case atomic: |
| 2071 // Atomic Parenthesis. | 2101 // Atomic Parenthesis. |
| 2072 // Insert a LD_SP operation to restore the state stack to the position | 2102 // Insert a LD_SP operation to restore the state stack to the position |
| 2073 // it was when the atomic parens were entered. | 2103 // it was when the atomic parens were entered. |
| 2074 { | 2104 { |
| 2075 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOp
enParen+1); | 2105 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOp
enParen+1); |
| 2076 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); | 2106 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); |
| 2077 int32_t stoLoc = URX_VAL(stoOp); | 2107 int32_t stoLoc = URX_VAL(stoOp); |
| 2078 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); | 2108 appendOp(URX_LD_SP, stoLoc); |
| 2079 fRXPat->fCompiledPat->addElement(ldOp, *fStatus); | |
| 2080 } | 2109 } |
| 2081 break; | 2110 break; |
| 2082 | 2111 |
| 2083 case lookAhead: | 2112 case lookAhead: |
| 2084 { | 2113 { |
| 2085 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); | 2114 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); |
| 2086 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | 2115 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); |
| 2087 int32_t dataLoc = URX_VAL(startOp); | 2116 int32_t dataLoc = URX_VAL(startOp); |
| 2088 int32_t op = URX_BUILD(URX_LA_END, dataLoc); | 2117 appendOp(URX_LA_END, dataLoc); |
| 2089 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 2090 } | 2118 } |
| 2091 break; | 2119 break; |
| 2092 | 2120 |
| 2093 case negLookAhead: | 2121 case negLookAhead: |
| 2094 { | 2122 { |
| 2095 // See comment at doOpenLookAheadNeg | 2123 // See comment at doOpenLookAheadNeg |
| 2096 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-1); | 2124 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-1); |
| 2097 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | 2125 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); |
| 2098 int32_t dataLoc = URX_VAL(startOp); | 2126 int32_t dataLoc = URX_VAL(startOp); |
| 2099 int32_t op = URX_BUILD(URX_LA_END, dataLoc); | 2127 appendOp(URX_LA_END, dataLoc); |
| 2100 fRXPat->fCompiledPat->addElement(op, *fStatus); | 2128 appendOp(URX_BACKTRACK, 0); |
| 2101 op = URX_BUILD(URX_BACKTRACK, 0); | 2129 appendOp(URX_LA_END, dataLoc); |
| 2102 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 2103 op = URX_BUILD(URX_LA_END, dataLoc); | |
| 2104 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 2105 | 2130 |
| 2106 // Patch the URX_SAVE near the top of the block. | 2131 // Patch the URX_SAVE near the top of the block. |
| 2107 // The destination of the SAVE is the final LA_END that was just add
ed. | 2132 // The destination of the SAVE is the final LA_END that was just add
ed. |
| 2108 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen); | 2133 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen); |
| 2109 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); | 2134 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); |
| 2110 int32_t dest = fRXPat->fCompiledPat->size()-1; | 2135 int32_t dest = fRXPat->fCompiledPat->size()-1; |
| 2111 saveOp = URX_BUILD(URX_STATE_SAVE, dest); | 2136 saveOp = buildOp(URX_STATE_SAVE, dest); |
| 2112 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); | 2137 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); |
| 2113 } | 2138 } |
| 2114 break; | 2139 break; |
| 2115 | 2140 |
| 2116 case lookBehind: | 2141 case lookBehind: |
| 2117 { | 2142 { |
| 2118 // See comment at doOpenLookBehind. | 2143 // See comment at doOpenLookBehind. |
| 2119 | 2144 |
| 2120 // Append the URX_LB_END and URX_LA_END to the compiled pattern. | 2145 // Append the URX_LB_END and URX_LA_END to the compiled pattern. |
| 2121 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-4); | 2146 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-4); |
| 2122 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | 2147 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); |
| 2123 int32_t dataLoc = URX_VAL(startOp); | 2148 int32_t dataLoc = URX_VAL(startOp); |
| 2124 int32_t op = URX_BUILD(URX_LB_END, dataLoc); | 2149 appendOp(URX_LB_END, dataLoc); |
| 2125 fRXPat->fCompiledPat->addElement(op, *fStatus); | 2150 appendOp(URX_LA_END, dataLoc); |
| 2126 op = URX_BUILD(URX_LA_END, dataLoc); | |
| 2127 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 2128 | 2151 |
| 2129 // Determine the min and max bounds for the length of the | 2152 // Determine the min and max bounds for the length of the |
| 2130 // string that the pattern can match. | 2153 // string that the pattern can match. |
| 2131 // An unbounded upper limit is an error. | 2154 // An unbounded upper limit is an error. |
| 2132 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | 2155 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; |
| 2133 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | 2156 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); |
| 2134 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | 2157 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); |
| 2158 if (URX_TYPE(maxML) != 0) { |
| 2159 error(U_REGEX_LOOK_BEHIND_LIMIT); |
| 2160 break; |
| 2161 } |
| 2135 if (maxML == INT32_MAX) { | 2162 if (maxML == INT32_MAX) { |
| 2136 error(U_REGEX_LOOK_BEHIND_LIMIT); | 2163 error(U_REGEX_LOOK_BEHIND_LIMIT); |
| 2137 break; | 2164 break; |
| 2138 } | 2165 } |
| 2139 U_ASSERT(minML <= maxML); | 2166 U_ASSERT(minML <= maxML); |
| 2140 | 2167 |
| 2141 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat | 2168 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat |
| 2142 // appears at the top of the look-behind block, at location fMatchO
penParen+1 | 2169 // appears at the top of the look-behind block, at location fMatchO
penParen+1 |
| 2143 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); | 2170 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); |
| 2144 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); | 2171 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); |
| 2145 | 2172 |
| 2146 } | 2173 } |
| 2147 break; | 2174 break; |
| 2148 | 2175 |
| 2149 | 2176 |
| 2150 | 2177 |
| 2151 case lookBehindN: | 2178 case lookBehindN: |
| 2152 { | 2179 { |
| 2153 // See comment at doOpenLookBehindNeg. | 2180 // See comment at doOpenLookBehindNeg. |
| 2154 | 2181 |
| 2155 // Append the URX_LBN_END to the compiled pattern. | 2182 // Append the URX_LBN_END to the compiled pattern. |
| 2156 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); | 2183 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); |
| 2157 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | 2184 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); |
| 2158 int32_t dataLoc = URX_VAL(startOp); | 2185 int32_t dataLoc = URX_VAL(startOp); |
| 2159 int32_t op = URX_BUILD(URX_LBN_END, dataLoc); | 2186 appendOp(URX_LBN_END, dataLoc); |
| 2160 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 2161 | 2187 |
| 2162 // Determine the min and max bounds for the length of the | 2188 // Determine the min and max bounds for the length of the |
| 2163 // string that the pattern can match. | 2189 // string that the pattern can match. |
| 2164 // An unbounded upper limit is an error. | 2190 // An unbounded upper limit is an error. |
| 2165 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | 2191 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; |
| 2166 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | 2192 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); |
| 2167 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | 2193 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); |
| 2194 if (URX_TYPE(maxML) != 0) { |
| 2195 error(U_REGEX_LOOK_BEHIND_LIMIT); |
| 2196 break; |
| 2197 } |
| 2168 if (maxML == INT32_MAX) { | 2198 if (maxML == INT32_MAX) { |
| 2169 error(U_REGEX_LOOK_BEHIND_LIMIT); | 2199 error(U_REGEX_LOOK_BEHIND_LIMIT); |
| 2170 break; | 2200 break; |
| 2171 } | 2201 } |
| 2172 U_ASSERT(minML <= maxML); | 2202 U_ASSERT(minML <= maxML); |
| 2173 | 2203 |
| 2174 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat | 2204 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat |
| 2175 // appears at the top of the look-behind block, at location fMatchO
penParen+1 | 2205 // appears at the top of the look-behind block, at location fMatchO
penParen+1 |
| 2176 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); | 2206 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); |
| 2177 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); | 2207 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); |
| 2178 | 2208 |
| 2179 // Insert the pattern location to continue at after a successful mat
ch | 2209 // Insert the pattern location to continue at after a successful mat
ch |
| 2180 // as the last operand of the URX_LBN_CONT | 2210 // as the last operand of the URX_LBN_CONT |
| 2181 op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); | 2211 int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); |
| 2182 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); | 2212 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); |
| 2183 } | 2213 } |
| 2184 break; | 2214 break; |
| 2185 | 2215 |
| 2186 | 2216 |
| 2187 | 2217 |
| 2188 default: | 2218 default: |
| 2189 U_ASSERT(FALSE); | 2219 U_ASSERT(FALSE); |
| 2190 } | 2220 } |
| 2191 | 2221 |
| (...skipping 20 matching lines...) Expand all Loading... |
| 2212 // There shoudn't be any, but just in case. | 2242 // There shoudn't be any, but just in case. |
| 2213 // (Case Closure can add them; if we had a simple case closure avaialble
that | 2243 // (Case Closure can add them; if we had a simple case closure avaialble
that |
| 2214 // ignored strings, that would be better.) | 2244 // ignored strings, that would be better.) |
| 2215 theSet->removeAllStrings(); | 2245 theSet->removeAllStrings(); |
| 2216 int32_t setSize = theSet->size(); | 2246 int32_t setSize = theSet->size(); |
| 2217 | 2247 |
| 2218 switch (setSize) { | 2248 switch (setSize) { |
| 2219 case 0: | 2249 case 0: |
| 2220 { | 2250 { |
| 2221 // Set of no elements. Always fails to match. | 2251 // Set of no elements. Always fails to match. |
| 2222 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStat
us); | 2252 appendOp(URX_BACKTRACK, 0); |
| 2223 delete theSet; | 2253 delete theSet; |
| 2224 } | 2254 } |
| 2225 break; | 2255 break; |
| 2226 | 2256 |
| 2227 case 1: | 2257 case 1: |
| 2228 { | 2258 { |
| 2229 // The set contains only a single code point. Put it into | 2259 // The set contains only a single code point. Put it into |
| 2230 // the compiled pattern as a single char operation rather | 2260 // the compiled pattern as a single char operation rather |
| 2231 // than a set, and discard the set itself. | 2261 // than a set, and discard the set itself. |
| 2232 literalChar(theSet->charAt(0)); | 2262 literalChar(theSet->charAt(0)); |
| 2233 delete theSet; | 2263 delete theSet; |
| 2234 } | 2264 } |
| 2235 break; | 2265 break; |
| 2236 | 2266 |
| 2237 default: | 2267 default: |
| 2238 { | 2268 { |
| 2239 // The set contains two or more chars. (the normal case) | 2269 // The set contains two or more chars. (the normal case) |
| 2240 // Put it into the compiled pattern as a set. | 2270 // Put it into the compiled pattern as a set. |
| 2241 int32_t setNumber = fRXPat->fSets->size(); | 2271 int32_t setNumber = fRXPat->fSets->size(); |
| 2242 fRXPat->fSets->addElement(theSet, *fStatus); | 2272 fRXPat->fSets->addElement(theSet, *fStatus); |
| 2243 int32_t setOp = URX_BUILD(URX_SETREF, setNumber); | 2273 appendOp(URX_SETREF, setNumber); |
| 2244 fRXPat->fCompiledPat->addElement(setOp, *fStatus); | |
| 2245 } | 2274 } |
| 2246 } | 2275 } |
| 2247 } | 2276 } |
| 2248 | 2277 |
| 2249 | 2278 |
| 2250 //------------------------------------------------------------------------------ | 2279 //------------------------------------------------------------------------------ |
| 2251 // | 2280 // |
| 2252 // compileInterval Generate the code for a {min, max} style interval quanti
fier. | 2281 // compileInterval Generate the code for a {min, max} style interval quanti
fier. |
| 2253 // Except for the specific opcodes used, the code is the sa
me | 2282 // Except for the specific opcodes used, the code is the sa
me |
| 2254 // for all three types (greedy, non-greedy, possessive) of | 2283 // for all three types (greedy, non-greedy, possessive) of |
| (...skipping 18 matching lines...) Expand all Loading... |
| 2273 int32_t topOfBlock = blockTopLoc(TRUE); | 2302 int32_t topOfBlock = blockTopLoc(TRUE); |
| 2274 insertOp(topOfBlock); | 2303 insertOp(topOfBlock); |
| 2275 insertOp(topOfBlock); | 2304 insertOp(topOfBlock); |
| 2276 insertOp(topOfBlock); | 2305 insertOp(topOfBlock); |
| 2277 | 2306 |
| 2278 // The operands for the CTR_INIT opcode include the index in the matcher dat
a | 2307 // The operands for the CTR_INIT opcode include the index in the matcher dat
a |
| 2279 // of the counter. Allocate it now. There are two data items | 2308 // of the counter. Allocate it now. There are two data items |
| 2280 // counterLoc --> Loop counter | 2309 // counterLoc --> Loop counter |
| 2281 // +1 --> Input index (for breaking non-progressing loops) | 2310 // +1 --> Input index (for breaking non-progressing loops) |
| 2282 // (Only present if unbounded upper limit on loop) | 2311 // (Only present if unbounded upper limit on loop) |
| 2283 int32_t counterLoc = fRXPat->fFrameSize; | 2312 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1; |
| 2284 fRXPat->fFrameSize++; | 2313 int32_t counterLoc = allocateStackData(dataSize); |
| 2285 if (fIntervalUpper < 0) { | |
| 2286 fRXPat->fFrameSize++; | |
| 2287 } | |
| 2288 | 2314 |
| 2289 int32_t op = URX_BUILD(InitOp, counterLoc); | 2315 int32_t op = buildOp(InitOp, counterLoc); |
| 2290 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); | 2316 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); |
| 2291 | 2317 |
| 2292 // The second operand of CTR_INIT is the location following the end of the l
oop. | 2318 // The second operand of CTR_INIT is the location following the end of the l
oop. |
| 2293 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if
the | 2319 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if
the |
| 2294 // compilation of something later on causes the code to grow and the targe
t | 2320 // compilation of something later on causes the code to grow and the targe
t |
| 2295 // position to move. | 2321 // position to move. |
| 2296 int32_t loopEnd = fRXPat->fCompiledPat->size(); | 2322 int32_t loopEnd = fRXPat->fCompiledPat->size(); |
| 2297 op = URX_BUILD(URX_RELOC_OPRND, loopEnd); | 2323 op = buildOp(URX_RELOC_OPRND, loopEnd); |
| 2298 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); | 2324 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); |
| 2299 | 2325 |
| 2300 // Followed by the min and max counts. | 2326 // Followed by the min and max counts. |
| 2301 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); | 2327 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); |
| 2302 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); | 2328 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); |
| 2303 | 2329 |
| 2304 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. | 2330 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. |
| 2305 // Goes at end of the block being looped over, so just append to the code
so far. | 2331 // Goes at end of the block being looped over, so just append to the code
so far. |
| 2306 op = URX_BUILD(LoopOp, topOfBlock); | 2332 appendOp(LoopOp, topOfBlock); |
| 2307 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 2308 | 2333 |
| 2309 if ((fIntervalLow & 0xff000000) != 0 || | 2334 if ((fIntervalLow & 0xff000000) != 0 || |
| 2310 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { | 2335 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { |
| 2311 error(U_REGEX_NUMBER_TOO_BIG); | 2336 error(U_REGEX_NUMBER_TOO_BIG); |
| 2312 } | 2337 } |
| 2313 | 2338 |
| 2314 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { | 2339 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { |
| 2315 error(U_REGEX_MAX_LT_MIN); | 2340 error(U_REGEX_MAX_LT_MIN); |
| 2316 } | 2341 } |
| 2317 } | 2342 } |
| 2318 | 2343 |
| 2319 | 2344 |
| 2320 | 2345 |
| 2321 UBool RegexCompile::compileInlineInterval() { | 2346 UBool RegexCompile::compileInlineInterval() { |
| 2322 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { | 2347 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { |
| 2323 // Too big to inline. Fail, which will cause looping code to be generat
ed. | 2348 // Too big to inline. Fail, which will cause looping code to be generat
ed. |
| 2324 // (Upper < Lower picks up unbounded upper and errors, both.) | 2349 // (Upper < Lower picks up unbounded upper and errors, both.) |
| 2325 return FALSE; | 2350 return FALSE; |
| 2326 } | 2351 } |
| 2327 | 2352 |
| 2328 int32_t topOfBlock = blockTopLoc(FALSE); | 2353 int32_t topOfBlock = blockTopLoc(FALSE); |
| 2329 if (fIntervalUpper == 0) { | 2354 if (fIntervalUpper == 0) { |
| 2330 // Pathological case. Attempt no matches, as if the block doesn't exist
. | 2355 // Pathological case. Attempt no matches, as if the block doesn't exist
. |
| 2356 // Discard the generated code for the block. |
| 2357 // If the block included parens, discard the info pertaining to them as
well. |
| 2331 fRXPat->fCompiledPat->setSize(topOfBlock); | 2358 fRXPat->fCompiledPat->setSize(topOfBlock); |
| 2359 if (fMatchOpenParen >= topOfBlock) { |
| 2360 fMatchOpenParen = -1; |
| 2361 } |
| 2362 if (fMatchCloseParen >= topOfBlock) { |
| 2363 fMatchCloseParen = -1; |
| 2364 } |
| 2332 return TRUE; | 2365 return TRUE; |
| 2333 } | 2366 } |
| 2334 | 2367 |
| 2335 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { | 2368 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { |
| 2336 // The thing being repeated is not a single op, but some | 2369 // The thing being repeated is not a single op, but some |
| 2337 // more complex block. Do it as a loop, not inlines. | 2370 // more complex block. Do it as a loop, not inlines. |
| 2338 // Note that things "repeated" a max of once are handled as inline, be
cause | 2371 // Note that things "repeated" a max of once are handled as inline, be
cause |
| 2339 // the one copy of the code already generated is just fine. | 2372 // the one copy of the code already generated is just fine. |
| 2340 return FALSE; | 2373 return FALSE; |
| 2341 } | 2374 } |
| 2342 | 2375 |
| 2343 // Pick up the opcode that is to be repeated | 2376 // Pick up the opcode that is to be repeated |
| 2344 // | 2377 // |
| 2345 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); | 2378 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); |
| 2346 | 2379 |
| 2347 // Compute the pattern location where the inline sequence | 2380 // Compute the pattern location where the inline sequence |
| 2348 // will end, and set up the state save op that will be needed. | 2381 // will end, and set up the state save op that will be needed. |
| 2349 // | 2382 // |
| 2350 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 | 2383 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 |
| 2351 + fIntervalUpper + (fIntervalUpper-fIntervalLow)
; | 2384 + fIntervalUpper + (fIntervalUpper-fIntervalLow)
; |
| 2352 int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); | 2385 int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc); |
| 2353 if (fIntervalLow == 0) { | 2386 if (fIntervalLow == 0) { |
| 2354 insertOp(topOfBlock); | 2387 insertOp(topOfBlock); |
| 2355 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); | 2388 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); |
| 2356 } | 2389 } |
| 2357 | 2390 |
| 2358 | 2391 |
| 2359 | 2392 |
| 2360 // Loop, emitting the op for the thing being repeated each time. | 2393 // Loop, emitting the op for the thing being repeated each time. |
| 2361 // Loop starts at 1 because one instance of the op already exists in the
pattern, | 2394 // Loop starts at 1 because one instance of the op already exists in the
pattern, |
| 2362 // it was put there when it was originally encountered. | 2395 // it was put there when it was originally encountered. |
| 2363 int32_t i; | 2396 int32_t i; |
| 2364 for (i=1; i<fIntervalUpper; i++ ) { | 2397 for (i=1; i<fIntervalUpper; i++ ) { |
| 2365 if (i == fIntervalLow) { | 2398 if (i >= fIntervalLow) { |
| 2366 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); | 2399 appendOp(saveOp); |
| 2367 } | 2400 } |
| 2368 if (i > fIntervalLow) { | 2401 appendOp(op); |
| 2369 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); | |
| 2370 } | |
| 2371 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 2372 } | 2402 } |
| 2373 return TRUE; | 2403 return TRUE; |
| 2374 } | 2404 } |
| 2375 | 2405 |
| 2376 | 2406 |
| 2377 | 2407 |
| 2378 //------------------------------------------------------------------------------ | 2408 //------------------------------------------------------------------------------ |
| 2379 // | 2409 // |
| 2380 // caseInsensitiveStart given a single code point from a pattern string, dete
rmine the | 2410 // caseInsensitiveStart given a single code point from a pattern string, dete
rmine the |
| 2381 // set of characters that could potentially begin a case
-insensitive | 2411 // set of characters that could potentially begin a case
-insensitive |
| (...skipping 1198 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3580 case URX_CTR_LOOP_NG: | 3610 case URX_CTR_LOOP_NG: |
| 3581 case URX_RELOC_OPRND: | 3611 case URX_RELOC_OPRND: |
| 3582 case URX_JMPX: | 3612 case URX_JMPX: |
| 3583 case URX_JMP_SAV: | 3613 case URX_JMP_SAV: |
| 3584 case URX_JMP_SAV_X: | 3614 case URX_JMP_SAV_X: |
| 3585 // These are instructions with operands that refer to code locations
. | 3615 // These are instructions with operands that refer to code locations
. |
| 3586 { | 3616 { |
| 3587 int32_t operandAddress = URX_VAL(op); | 3617 int32_t operandAddress = URX_VAL(op); |
| 3588 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); | 3618 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); |
| 3589 int32_t fixedOperandAddress = operandAddress - deltas.elementAti
(operandAddress); | 3619 int32_t fixedOperandAddress = operandAddress - deltas.elementAti
(operandAddress); |
| 3590 op = URX_BUILD(opType, fixedOperandAddress); | 3620 op = buildOp(opType, fixedOperandAddress); |
| 3591 fRXPat->fCompiledPat->setElementAt(op, dst); | 3621 fRXPat->fCompiledPat->setElementAt(op, dst); |
| 3592 dst++; | 3622 dst++; |
| 3593 break; | 3623 break; |
| 3594 } | 3624 } |
| 3595 | 3625 |
| 3596 case URX_BACKREF: | 3626 case URX_BACKREF: |
| 3597 case URX_BACKREF_I: | 3627 case URX_BACKREF_I: |
| 3598 { | 3628 { |
| 3599 int32_t where = URX_VAL(op); | 3629 int32_t where = URX_VAL(op); |
| 3600 if (where > fRXPat->fGroupMap->size()) { | 3630 if (where > fRXPat->fGroupMap->size()) { |
| 3601 error(U_REGEX_INVALID_BACK_REF); | 3631 error(U_REGEX_INVALID_BACK_REF); |
| 3602 break; | 3632 break; |
| 3603 } | 3633 } |
| 3604 where = fRXPat->fGroupMap->elementAti(where-1); | 3634 where = fRXPat->fGroupMap->elementAti(where-1); |
| 3605 op = URX_BUILD(opType, where); | 3635 op = buildOp(opType, where); |
| 3606 fRXPat->fCompiledPat->setElementAt(op, dst); | 3636 fRXPat->fCompiledPat->setElementAt(op, dst); |
| 3607 dst++; | 3637 dst++; |
| 3608 | 3638 |
| 3609 fRXPat->fNeedsAltInput = TRUE; | 3639 fRXPat->fNeedsAltInput = TRUE; |
| 3610 break; | 3640 break; |
| 3611 } | 3641 } |
| 3612 case URX_RESERVED_OP: | 3642 case URX_RESERVED_OP: |
| 3613 case URX_RESERVED_OP_N: | 3643 case URX_RESERVED_OP_N: |
| 3614 case URX_BACKTRACK: | 3644 case URX_BACKTRACK: |
| 3615 case URX_END: | 3645 case URX_END: |
| (...skipping 331 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3947 fEOLComments = TRUE; | 3977 fEOLComments = TRUE; |
| 3948 | 3978 |
| 3949 // putc(c.fChar, stdout); | 3979 // putc(c.fChar, stdout); |
| 3950 } | 3980 } |
| 3951 | 3981 |
| 3952 | 3982 |
| 3953 | 3983 |
| 3954 //------------------------------------------------------------------------------ | 3984 //------------------------------------------------------------------------------ |
| 3955 // | 3985 // |
| 3956 // scanNamedChar | 3986 // scanNamedChar |
| 3957 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. | 3987 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. |
| 3958 // | 3988 // |
| 3959 // The scan position will be at the 'N'. On return | 3989 // The scan position will be at the 'N'. On return |
| 3960 // the scan position should be just after the '}' | 3990 // the scan position should be just after the '}' |
| 3961 // | 3991 // |
| 3962 // Return the UChar32 | 3992 // Return the UChar32 |
| 3963 // | 3993 // |
| 3964 //------------------------------------------------------------------------------ | 3994 //------------------------------------------------------------------------------ |
| 3965 UChar32 RegexCompile::scanNamedChar() { | 3995 UChar32 RegexCompile::scanNamedChar() { |
| 3966 if (U_FAILURE(*fStatus)) { | 3996 if (U_FAILURE(*fStatus)) { |
| 3967 return 0; | 3997 return 0; |
| (...skipping 451 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4419 | 4449 |
| 4420 void RegexCompile::setPushOp(int32_t op) { | 4450 void RegexCompile::setPushOp(int32_t op) { |
| 4421 setEval(op); | 4451 setEval(op); |
| 4422 fSetOpStack.push(op, *fStatus); | 4452 fSetOpStack.push(op, *fStatus); |
| 4423 fSetStack.push(new UnicodeSet(), *fStatus); | 4453 fSetStack.push(new UnicodeSet(), *fStatus); |
| 4424 } | 4454 } |
| 4425 | 4455 |
| 4426 U_NAMESPACE_END | 4456 U_NAMESPACE_END |
| 4427 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | 4457 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 4428 | 4458 |
| OLD | NEW |