OLD | NEW |
1 // | 1 // |
2 // file: regexcmp.cpp | 2 // file: regexcmp.cpp |
3 // | 3 // |
4 // Copyright (C) 2002-2014 International Business Machines Corporation and othe
rs. | 4 // Copyright (C) 2002-2014 International Business Machines Corporation and othe
rs. |
5 // All Rights Reserved. | 5 // All Rights Reserved. |
6 // | 6 // |
7 // This file contains the ICU regular expression compiler, which is responsible | 7 // This file contains the ICU regular expression compiler, which is responsible |
8 // for processing a regular expression pattern into the compiled form that | 8 // for processing a regular expression pattern into the compiled form that |
9 // is used by the match finding engine. | 9 // is used by the match finding engine. |
10 // | 10 // |
(...skipping 283 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
294 n *= 10; | 294 n *= 10; |
295 } | 295 } |
296 | 296 |
297 // | 297 // |
298 // The pattern's fFrameSize so far has accumulated the requirements for | 298 // The pattern's fFrameSize so far has accumulated the requirements for |
299 // storage for capture parentheses, counters, etc. that are encountered | 299 // storage for capture parentheses, counters, etc. that are encountered |
300 // in the pattern. Add space for the two variables that are always | 300 // in the pattern. Add space for the two variables that are always |
301 // present in the saved state: the input string position (int64_t) and | 301 // present in the saved state: the input string position (int64_t) and |
302 // the position in the compiled pattern. | 302 // the position in the compiled pattern. |
303 // | 303 // |
304 fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; | 304 allocateStackData(RESTACKFRAME_HDRCOUNT); |
305 | 305 |
306 // | 306 // |
307 // Optimization pass 1: NOPs, back-references, and case-folding | 307 // Optimization pass 1: NOPs, back-references, and case-folding |
308 // | 308 // |
309 stripNOPs(); | 309 stripNOPs(); |
310 | 310 |
311 // | 311 // |
312 // Get bounds for the minimum and maximum length of a string that this | 312 // Get bounds for the minimum and maximum length of a string that this |
313 // pattern can match. Used to avoid looking for matches in strings that | 313 // pattern can match. Used to avoid looking for matches in strings that |
314 // are too short. | 314 // are too short. |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
360 | 360 |
361 case doPatStart: | 361 case doPatStart: |
362 // Start of pattern compiles to: | 362 // Start of pattern compiles to: |
363 //0 SAVE 2 Fall back to position of FAIL | 363 //0 SAVE 2 Fall back to position of FAIL |
364 //1 jmp 3 | 364 //1 jmp 3 |
365 //2 FAIL Stop if we ever reach here. | 365 //2 FAIL Stop if we ever reach here. |
366 //3 NOP Dummy, so start of pattern looks the same as | 366 //3 NOP Dummy, so start of pattern looks the same as |
367 // the start of an ( grouping. | 367 // the start of an ( grouping. |
368 //4 NOP Resreved, will be replaced by a save if there are | 368 //4 NOP Resreved, will be replaced by a save if there are |
369 // OR | operators at the top level | 369 // OR | operators at the top level |
370 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus)
; | 370 appendOp(URX_STATE_SAVE, 2); |
371 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); | 371 appendOp(URX_JMP, 3); |
372 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); | 372 appendOp(URX_FAIL, 0); |
373 | 373 |
374 // Standard open nonCapture paren action emits the two NOPs and | 374 // Standard open nonCapture paren action emits the two NOPs and |
375 // sets up the paren stack frame. | 375 // sets up the paren stack frame. |
376 doParseActions(doOpenNonCaptureParen); | 376 doParseActions(doOpenNonCaptureParen); |
377 break; | 377 break; |
378 | 378 |
379 case doPatFinish: | 379 case doPatFinish: |
380 // We've scanned to the end of the pattern | 380 // We've scanned to the end of the pattern |
381 // The end of pattern compiles to: | 381 // The end of pattern compiles to: |
382 // URX_END | 382 // URX_END |
383 // which will stop the runtime match engine. | 383 // which will stop the runtime match engine. |
384 // Encountering end of pattern also behaves like a close paren, | 384 // Encountering end of pattern also behaves like a close paren, |
385 // and forces fixups of the State Save at the beginning of the compile
d pattern | 385 // and forces fixups of the State Save at the beginning of the compile
d pattern |
386 // and of any OR operations at the top level. | 386 // and of any OR operations at the top level. |
387 // | 387 // |
388 handleCloseParen(); | 388 handleCloseParen(); |
389 if (fParenStack.size() > 0) { | 389 if (fParenStack.size() > 0) { |
390 // Missing close paren in pattern. | 390 // Missing close paren in pattern. |
391 error(U_REGEX_MISMATCHED_PAREN); | 391 error(U_REGEX_MISMATCHED_PAREN); |
392 } | 392 } |
393 | 393 |
394 // add the END operation to the compiled pattern. | 394 // add the END operation to the compiled pattern. |
395 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); | 395 appendOp(URX_END, 0); |
396 | 396 |
397 // Terminate the pattern compilation state machine. | 397 // Terminate the pattern compilation state machine. |
398 returnVal = FALSE; | 398 returnVal = FALSE; |
399 break; | 399 break; |
400 | 400 |
401 | 401 |
402 | 402 |
403 case doOrOperator: | 403 case doOrOperator: |
404 // Scanning a '|', as in (A|B) | 404 // Scanning a '|', as in (A|B) |
405 { | 405 { |
406 // Generate code for any pending literals preceding the '|' | 406 // Generate code for any pending literals preceding the '|' |
407 fixLiterals(FALSE); | 407 fixLiterals(FALSE); |
408 | 408 |
409 // Insert a SAVE operation at the start of the pattern section prece
ding | 409 // Insert a SAVE operation at the start of the pattern section prece
ding |
410 // this OR at this level. This SAVE will branch the match forward | 410 // this OR at this level. This SAVE will branch the match forward |
411 // to the right hand side of the OR in the event that the left han
d | 411 // to the right hand side of the OR in the event that the left han
d |
412 // side fails to match and backtracks. Locate the position for th
e | 412 // side fails to match and backtracks. Locate the position for th
e |
413 // save from the location on the top of the parentheses stack. | 413 // save from the location on the top of the parentheses stack. |
414 int32_t savePosition = fParenStack.popi(); | 414 int32_t savePosition = fParenStack.popi(); |
415 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition)
; | 415 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition)
; |
416 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved
location | 416 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved
location |
417 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); | 417 op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); |
418 fRXPat->fCompiledPat->setElementAt(op, savePosition); | 418 fRXPat->fCompiledPat->setElementAt(op, savePosition); |
419 | 419 |
420 // Append an JMP operation into the compiled pattern. The operand f
or | 420 // Append an JMP operation into the compiled pattern. The operand f
or |
421 // the JMP will eventually be the location following the ')' for th
e | 421 // the JMP will eventually be the location following the ')' for th
e |
422 // group. This will be patched in later, when the ')' is encounter
ed. | 422 // group. This will be patched in later, when the ')' is encounter
ed. |
423 op = URX_BUILD(URX_JMP, 0); | 423 appendOp(URX_JMP, 0); |
424 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
425 | 424 |
426 // Push the position of the newly added JMP op onto the parentheses
stack. | 425 // Push the position of the newly added JMP op onto the parentheses
stack. |
427 // This registers if for fixup when this block's close paren is enco
untered. | 426 // This registers if for fixup when this block's close paren is enco
untered. |
428 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | 427 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
429 | 428 |
430 // Append a NOP to the compiled pattern. This is the slot reserved | 429 // Append a NOP to the compiled pattern. This is the slot reserved |
431 // for a SAVE in the event that there is yet another '|' following | 430 // for a SAVE in the event that there is yet another '|' following |
432 // this one. | 431 // this one. |
433 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 432 appendOp(URX_NOP, 0); |
434 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | 433 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
435 } | 434 } |
436 break; | 435 break; |
437 | 436 |
438 | 437 |
439 case doOpenCaptureParen: | 438 case doOpenCaptureParen: |
440 // Open Paren. | 439 // Open Paren. |
441 // Compile to a | 440 // Compile to a |
442 // - NOP, which later may be replaced by a save-state if the | 441 // - NOP, which later may be replaced by a save-state if the |
443 // parenthesized group gets a * quantifier, followed by | 442 // parenthesized group gets a * quantifier, followed by |
444 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. | 443 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. |
445 // - NOP, which may later be replaced by a save-state if there | 444 // - NOP, which may later be replaced by a save-state if there |
446 // is an '|' alternation within the parens. | 445 // is an '|' alternation within the parens. |
447 // | 446 // |
448 // Each capture group gets three slots in the save stack frame: | 447 // Each capture group gets three slots in the save stack frame: |
449 // 0: Capture Group start position (in input string being matche
d.) | 448 // 0: Capture Group start position (in input string being matche
d.) |
450 // 1: Capture Group end position. | 449 // 1: Capture Group end position. |
451 // 2: Start of Match-in-progress. | 450 // 2: Start of Match-in-progress. |
452 // The first two locations are for a completed capture group, and are | 451 // The first two locations are for a completed capture group, and are |
453 // referred to by back references and the like. | 452 // referred to by back references and the like. |
454 // The third location stores the capture start position when an START
_CAPTURE is | 453 // The third location stores the capture start position when an START
_CAPTURE is |
455 // encountered. This will be promoted to a completed capture when
(and if) the corresponding | 454 // encountered. This will be promoted to a completed capture when
(and if) the corresponding |
456 // END_CAPTURE is encountered. | 455 // END_CAPTURE is encountered. |
457 { | 456 { |
458 fixLiterals(); | 457 fixLiterals(); |
459 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 458 appendOp(URX_NOP, 0); |
460 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots
in match stack frame. | 459 int32_t varsLoc = allocateStackData(3); // Reserve three slots i
n match stack frame. |
461 fRXPat->fFrameSize += 3; | 460 appendOp(URX_START_CAPTURE, varsLoc); |
462 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); | 461 appendOp(URX_NOP, 0); |
463 fRXPat->fCompiledPat->addElement(cop, *fStatus); | |
464 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
465 | 462 |
466 // On the Parentheses stack, start a new frame and add the postions | 463 // On the Parentheses stack, start a new frame and add the postions |
467 // of the two NOPs. Depending on what follows in the pattern, the | 464 // of the two NOPs. Depending on what follows in the pattern, the |
468 // NOPs may be changed to SAVE_STATE or JMP ops, with a target | 465 // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
469 // address of the end of the parenthesized group. | 466 // address of the end of the parenthesized group. |
470 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 467 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
471 fParenStack.push(capturing, *fStatus); // Fra
me type. | 468 fParenStack.push(capturing, *fStatus); // Fra
me type. |
472 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location | 469 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location |
473 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc | 470 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc |
474 | 471 |
475 // Save the mapping from group number to stack frame variable positi
on. | 472 // Save the mapping from group number to stack frame variable positi
on. |
476 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); | 473 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); |
477 } | 474 } |
478 break; | 475 break; |
479 | 476 |
480 case doOpenNonCaptureParen: | 477 case doOpenNonCaptureParen: |
481 // Open non-caputuring (grouping only) Paren. | 478 // Open non-caputuring (grouping only) Paren. |
482 // Compile to a | 479 // Compile to a |
483 // - NOP, which later may be replaced by a save-state if the | 480 // - NOP, which later may be replaced by a save-state if the |
484 // parenthesized group gets a * quantifier, followed by | 481 // parenthesized group gets a * quantifier, followed by |
485 // - NOP, which may later be replaced by a save-state if there | 482 // - NOP, which may later be replaced by a save-state if there |
486 // is an '|' alternation within the parens. | 483 // is an '|' alternation within the parens. |
487 { | 484 { |
488 fixLiterals(); | 485 fixLiterals(); |
489 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 486 appendOp(URX_NOP, 0); |
490 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 487 appendOp(URX_NOP, 0); |
491 | 488 |
492 // On the Parentheses stack, start a new frame and add the postions | 489 // On the Parentheses stack, start a new frame and add the postions |
493 // of the two NOPs. | 490 // of the two NOPs. |
494 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 491 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
495 fParenStack.push(plain, *fStatus); // Beg
in a new frame. | 492 fParenStack.push(plain, *fStatus); // Beg
in a new frame. |
496 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 493 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
497 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc | 494 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc |
498 } | 495 } |
499 break; | 496 break; |
500 | 497 |
501 | 498 |
502 case doOpenAtomicParen: | 499 case doOpenAtomicParen: |
503 // Open Atomic Paren. (?> | 500 // Open Atomic Paren. (?> |
504 // Compile to a | 501 // Compile to a |
505 // - NOP, which later may be replaced if the parenthesized group | 502 // - NOP, which later may be replaced if the parenthesized group |
506 // has a quantifier, followed by | 503 // has a quantifier, followed by |
507 // - STO_SP save state stack position, so it can be restored at th
e ")" | 504 // - STO_SP save state stack position, so it can be restored at th
e ")" |
508 // - NOP, which may later be replaced by a save-state if there | 505 // - NOP, which may later be replaced by a save-state if there |
509 // is an '|' alternation within the parens. | 506 // is an '|' alternation within the parens. |
510 { | 507 { |
511 fixLiterals(); | 508 fixLiterals(); |
512 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 509 appendOp(URX_NOP, 0); |
513 int32_t varLoc = fRXPat->fDataSize; // Reserve a data locatio
n for saving the | 510 int32_t varLoc = allocateData(1); // Reserve a data location for
saving the state stack ptr. |
514 fRXPat->fDataSize += 1; // state stack ptr. | 511 appendOp(URX_STO_SP, varLoc); |
515 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); | 512 appendOp(URX_NOP, 0); |
516 fRXPat->fCompiledPat->addElement(stoOp, *fStatus); | |
517 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
518 | 513 |
519 // On the Parentheses stack, start a new frame and add the postions | 514 // On the Parentheses stack, start a new frame and add the postions |
520 // of the two NOPs. Depending on what follows in the pattern, the | 515 // of the two NOPs. Depending on what follows in the pattern, the |
521 // NOPs may be changed to SAVE_STATE or JMP ops, with a target | 516 // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
522 // address of the end of the parenthesized group. | 517 // address of the end of the parenthesized group. |
523 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 518 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
524 fParenStack.push(atomic, *fStatus); // Fra
me type. | 519 fParenStack.push(atomic, *fStatus); // Fra
me type. |
525 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP | 520 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP |
526 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP | 521 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP |
527 } | 522 } |
(...skipping 22 matching lines...) Expand all Loading... |
550 // 6. NOP reserved for use by quantifiers on the block
. | 545 // 6. NOP reserved for use by quantifiers on the block
. |
551 // Look-ahead can't have quantifiers, but paren
stack | 546 // Look-ahead can't have quantifiers, but paren
stack |
552 // compile time conventions require the slot
anyhow. | 547 // compile time conventions require the slot
anyhow. |
553 // 7. NOP may be replaced if there is are '|' ops in t
he block. | 548 // 7. NOP may be replaced if there is are '|' ops in t
he block. |
554 // 8. code for parenthesized stuff. | 549 // 8. code for parenthesized stuff. |
555 // 9. LA_END | 550 // 9. LA_END |
556 // | 551 // |
557 // Two data slots are reserved, for saving the stack ptr and the input
position. | 552 // Two data slots are reserved, for saving the stack ptr and the input
position. |
558 { | 553 { |
559 fixLiterals(); | 554 fixLiterals(); |
560 int32_t dataLoc = fRXPat->fDataSize; | 555 int32_t dataLoc = allocateData(2); |
561 fRXPat->fDataSize += 2; | 556 appendOp(URX_LA_START, dataLoc); |
562 int32_t op = URX_BUILD(URX_LA_START, dataLoc); | 557 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); |
563 fRXPat->fCompiledPat->addElement(op, *fStatus); | 558 appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3); |
564 | 559 appendOp(URX_LA_END, dataLoc); |
565 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); | 560 appendOp(URX_BACKTRACK, 0); |
566 fRXPat->fCompiledPat->addElement(op, *fStatus); | 561 appendOp(URX_NOP, 0); |
567 | 562 appendOp(URX_NOP, 0); |
568 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); | |
569 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
570 | |
571 op = URX_BUILD(URX_LA_END, dataLoc); | |
572 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
573 | |
574 op = URX_BUILD(URX_BACKTRACK, 0); | |
575 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
576 | |
577 op = URX_BUILD(URX_NOP, 0); | |
578 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
579 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
580 | 563 |
581 // On the Parentheses stack, start a new frame and add the postions | 564 // On the Parentheses stack, start a new frame and add the postions |
582 // of the NOPs. | 565 // of the NOPs. |
583 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 566 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
584 fParenStack.push(lookAhead, *fStatus); // Fra
me type. | 567 fParenStack.push(lookAhead, *fStatus); // Fra
me type. |
585 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 568 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
586 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location | 569 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location |
587 } | 570 } |
588 break; | 571 break; |
589 | 572 |
590 case doOpenLookAheadNeg: | 573 case doOpenLookAheadNeg: |
591 // Negated Lookahead. (?! stuff ) | 574 // Negated Lookahead. (?! stuff ) |
592 // Compiles to | 575 // Compiles to |
593 // 1. START_LA dataloc | 576 // 1. START_LA dataloc |
594 // 2. SAVE_STATE 7 // Fail within look-ahead block restor
es to this state, | 577 // 2. SAVE_STATE 7 // Fail within look-ahead block restor
es to this state, |
595 // // which continues with the match. | 578 // // which continues with the match. |
596 // 3. NOP // Std. Open Paren sequence, for possi
ble '|' | 579 // 3. NOP // Std. Open Paren sequence, for possi
ble '|' |
597 // 4. code for parenthesized stuff. | 580 // 4. code for parenthesized stuff. |
598 // 5. END_LA // Cut back stack, remove saved state
from step 2. | 581 // 5. END_LA // Cut back stack, remove saved state
from step 2. |
599 // 6. BACKTRACK // code in block succeeded, so neg. lo
okahead fails. | 582 // 6. BACKTRACK // code in block succeeded, so neg. lo
okahead fails. |
600 // 7. END_LA // Restore match region, in case look-
ahead was using | 583 // 7. END_LA // Restore match region, in case look-
ahead was using |
601 // an alternate (transparent) reg
ion. | 584 // an alternate (transparent) reg
ion. |
602 { | 585 { |
603 fixLiterals(); | 586 fixLiterals(); |
604 int32_t dataLoc = fRXPat->fDataSize; | 587 int32_t dataLoc = allocateData(2); |
605 fRXPat->fDataSize += 2; | 588 appendOp(URX_LA_START, dataLoc); |
606 int32_t op = URX_BUILD(URX_LA_START, dataLoc); | 589 appendOp(URX_STATE_SAVE, 0); // dest address will be patched late
r. |
607 fRXPat->fCompiledPat->addElement(op, *fStatus); | 590 appendOp(URX_NOP, 0); |
608 | |
609 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patche
d later. | |
610 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
611 | |
612 op = URX_BUILD(URX_NOP, 0); | |
613 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
614 | 591 |
615 // On the Parentheses stack, start a new frame and add the postions | 592 // On the Parentheses stack, start a new frame and add the postions |
616 // of the StateSave and NOP. | 593 // of the StateSave and NOP. |
617 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 594 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
618 fParenStack.push(negLookAhead, *fStatus); // Fram
e type | 595 fParenStack.push(negLookAhead, *fStatus); // Fram
e type |
619 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
STATE_SAVE location | 596 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
STATE_SAVE location |
620 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location | 597 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location |
621 | 598 |
622 // Instructions #5 - #7 will be added when the ')' is encountered. | 599 // Instructions #5 - #7 will be added when the ')' is encountered. |
623 } | 600 } |
(...skipping 17 matching lines...) Expand all Loading... |
641 // Allocate a block of matcher data, to contain (when runni
ng a match) | 618 // Allocate a block of matcher data, to contain (when runni
ng a match) |
642 // 0: Stack ptr on entry | 619 // 0: Stack ptr on entry |
643 // 1: Input Index on entry | 620 // 1: Input Index on entry |
644 // 2: Start index of match current match attempt. | 621 // 2: Start index of match current match attempt. |
645 // 3: Original Input String len. | 622 // 3: Original Input String len. |
646 | 623 |
647 // Generate match code for any pending literals. | 624 // Generate match code for any pending literals. |
648 fixLiterals(); | 625 fixLiterals(); |
649 | 626 |
650 // Allocate data space | 627 // Allocate data space |
651 int32_t dataLoc = fRXPat->fDataSize; | 628 int32_t dataLoc = allocateData(4); |
652 fRXPat->fDataSize += 4; | |
653 | 629 |
654 // Emit URX_LB_START | 630 // Emit URX_LB_START |
655 int32_t op = URX_BUILD(URX_LB_START, dataLoc); | 631 appendOp(URX_LB_START, dataLoc); |
656 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
657 | 632 |
658 // Emit URX_LB_CONT | 633 // Emit URX_LB_CONT |
659 op = URX_BUILD(URX_LB_CONT, dataLoc); | 634 appendOp(URX_LB_CONT, dataLoc); |
660 fRXPat->fCompiledPat->addElement(op, *fStatus); | 635 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled la
ter. |
661 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength
. To be filled later. | 636 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled la
ter. |
662 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength
. To be filled later. | |
663 | 637 |
664 // Emit the NOP | 638 // Emit the NOPs |
665 op = URX_BUILD(URX_NOP, 0); | 639 appendOp(URX_NOP, 0); |
666 fRXPat->fCompiledPat->addElement(op, *fStatus); | 640 appendOp(URX_NOP, 0); |
667 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
668 | 641 |
669 // On the Parentheses stack, start a new frame and add the postions | 642 // On the Parentheses stack, start a new frame and add the postions |
670 // of the URX_LB_CONT and the NOP. | 643 // of the URX_LB_CONT and the NOP. |
671 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 644 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
672 fParenStack.push(lookBehind, *fStatus); // Fra
me type | 645 fParenStack.push(lookBehind, *fStatus); // Fra
me type |
673 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 646 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
674 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location | 647 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location |
675 | 648 |
676 // The final two instructions will be added when the ')' is encounte
red. | 649 // The final two instructions will be added when the ')' is encounte
red. |
677 } | 650 } |
(...skipping 19 matching lines...) Expand all Loading... |
697 // Allocate a block of matcher data, to contain (when runni
ng a match) | 670 // Allocate a block of matcher data, to contain (when runni
ng a match) |
698 // 0: Stack ptr on entry | 671 // 0: Stack ptr on entry |
699 // 1: Input Index on entry | 672 // 1: Input Index on entry |
700 // 2: Start index of match current match attempt. | 673 // 2: Start index of match current match attempt. |
701 // 3: Original Input String len. | 674 // 3: Original Input String len. |
702 | 675 |
703 // Generate match code for any pending literals. | 676 // Generate match code for any pending literals. |
704 fixLiterals(); | 677 fixLiterals(); |
705 | 678 |
706 // Allocate data space | 679 // Allocate data space |
707 int32_t dataLoc = fRXPat->fDataSize; | 680 int32_t dataLoc = allocateData(4); |
708 fRXPat->fDataSize += 4; | |
709 | 681 |
710 // Emit URX_LB_START | 682 // Emit URX_LB_START |
711 int32_t op = URX_BUILD(URX_LB_START, dataLoc); | 683 appendOp(URX_LB_START, dataLoc); |
712 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
713 | 684 |
714 // Emit URX_LBN_CONT | 685 // Emit URX_LBN_CONT |
715 op = URX_BUILD(URX_LBN_CONT, dataLoc); | 686 appendOp(URX_LBN_CONT, dataLoc); |
716 fRXPat->fCompiledPat->addElement(op, *fStatus); | 687 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled la
ter. |
717 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength
. To be filled later. | 688 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled la
ter. |
718 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength
. To be filled later. | 689 appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled la
ter. |
719 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc.
To be filled later. | |
720 | 690 |
721 // Emit the NOP | 691 // Emit the NOPs |
722 op = URX_BUILD(URX_NOP, 0); | 692 appendOp(URX_NOP, 0); |
723 fRXPat->fCompiledPat->addElement(op, *fStatus); | 693 appendOp(URX_NOP, 0); |
724 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
725 | 694 |
726 // On the Parentheses stack, start a new frame and add the postions | 695 // On the Parentheses stack, start a new frame and add the postions |
727 // of the URX_LB_CONT and the NOP. | 696 // of the URX_LB_CONT and the NOP. |
728 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 697 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
729 fParenStack.push(lookBehindN, *fStatus); // Fra
me type | 698 fParenStack.push(lookBehindN, *fStatus); // Fra
me type |
730 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 699 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
731 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location | 700 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location |
732 | 701 |
733 // The final two instructions will be added when the ')' is encounte
red. | 702 // The final two instructions will be added when the ')' is encounte
red. |
734 } | 703 } |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
784 { | 753 { |
785 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 | 754 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 |
786 int32_t frameLoc; | 755 int32_t frameLoc; |
787 | 756 |
788 // Check for simple constructs, which may get special optimized code
. | 757 // Check for simple constructs, which may get special optimized code
. |
789 if (topLoc == fRXPat->fCompiledPat->size() - 1) { | 758 if (topLoc == fRXPat->fCompiledPat->size() - 1) { |
790 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); | 759 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); |
791 | 760 |
792 if (URX_TYPE(repeatedOp) == URX_SETREF) { | 761 if (URX_TYPE(repeatedOp) == URX_SETREF) { |
793 // Emit optimized code for [char set]+ | 762 // Emit optimized code for [char set]+ |
794 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); | 763 appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); |
795 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); | 764 frameLoc = allocateStackData(1); |
796 frameLoc = fRXPat->fFrameSize; | 765 appendOp(URX_LOOP_C, frameLoc); |
797 fRXPat->fFrameSize++; | |
798 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | |
799 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
800 break; | 766 break; |
801 } | 767 } |
802 | 768 |
803 if (URX_TYPE(repeatedOp) == URX_DOTANY || | 769 if (URX_TYPE(repeatedOp) == URX_DOTANY || |
804 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || | 770 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || |
805 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { | 771 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { |
806 // Emit Optimized code for .+ operations. | 772 // Emit Optimized code for .+ operations. |
807 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | 773 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); |
808 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | 774 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { |
809 // URX_LOOP_DOT_I operand is a flag indicating ". matche
s any" mode. | 775 // URX_LOOP_DOT_I operand is a flag indicating ". matche
s any" mode. |
810 loopOpI |= 1; | 776 loopOpI |= 1; |
811 } | 777 } |
812 if (fModeFlags & UREGEX_UNIX_LINES) { | 778 if (fModeFlags & UREGEX_UNIX_LINES) { |
813 loopOpI |= 2; | 779 loopOpI |= 2; |
814 } | 780 } |
815 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); | 781 appendOp(loopOpI); |
816 frameLoc = fRXPat->fFrameSize; | 782 frameLoc = allocateStackData(1); |
817 fRXPat->fFrameSize++; | 783 appendOp(URX_LOOP_C, frameLoc); |
818 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | |
819 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
820 break; | 784 break; |
821 } | 785 } |
822 | 786 |
823 } | 787 } |
824 | 788 |
825 // General case. | 789 // General case. |
826 | 790 |
827 // Check for minimum match length of zero, which requires | 791 // Check for minimum match length of zero, which requires |
828 // extra loop-breaking code. | 792 // extra loop-breaking code. |
829 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { | 793 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { |
830 // Zero length match is possible. | 794 // Zero length match is possible. |
831 // Emit the code sequence that can handle it. | 795 // Emit the code sequence that can handle it. |
832 insertOp(topLoc); | 796 insertOp(topLoc); |
833 frameLoc = fRXPat->fFrameSize; | 797 frameLoc = allocateStackData(1); |
834 fRXPat->fFrameSize++; | |
835 | 798 |
836 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); | 799 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc); |
837 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 800 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
838 | 801 |
839 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); | 802 appendOp(URX_JMP_SAV_X, topLoc+1); |
840 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
841 } else { | 803 } else { |
842 // Simpler code when the repeated body must match something non-
empty | 804 // Simpler code when the repeated body must match something non-
empty |
843 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); | 805 appendOp(URX_JMP_SAV, topLoc); |
844 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); | |
845 } | 806 } |
846 } | 807 } |
847 break; | 808 break; |
848 | 809 |
849 case doNGPlus: | 810 case doNGPlus: |
850 // Non-greedy '+?' compiles to | 811 // Non-greedy '+?' compiles to |
851 // 1. stuff to be repeated (already built) | 812 // 1. stuff to be repeated (already built) |
852 // 2. state-save 1 | 813 // 2. state-save 1 |
853 // 3. ... | 814 // 3. ... |
854 { | 815 { |
855 int32_t topLoc = blockTopLoc(FALSE); | 816 int32_t topLoc = blockTopLoc(FALSE); |
856 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); | 817 appendOp(URX_STATE_SAVE, topLoc); |
857 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); | |
858 } | 818 } |
859 break; | 819 break; |
860 | 820 |
861 | 821 |
862 case doOpt: | 822 case doOpt: |
863 // Normal (greedy) ? quantifier. | 823 // Normal (greedy) ? quantifier. |
864 // Compiles to | 824 // Compiles to |
865 // 1. state save 3 | 825 // 1. state save 3 |
866 // 2. body of optional block | 826 // 2. body of optional block |
867 // 3. ... | 827 // 3. ... |
868 // Insert the state save into the compiled pattern, and we're done. | 828 // Insert the state save into the compiled pattern, and we're done. |
869 { | 829 { |
870 int32_t saveStateLoc = blockTopLoc(TRUE); | 830 int32_t saveStateLoc = blockTopLoc(TRUE); |
871 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiled
Pat->size()); | 831 int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPa
t->size()); |
872 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); | 832 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); |
873 } | 833 } |
874 break; | 834 break; |
875 | 835 |
876 case doNGOpt: | 836 case doNGOpt: |
877 // Non-greedy ?? quantifier | 837 // Non-greedy ?? quantifier |
878 // compiles to | 838 // compiles to |
879 // 1. jmp 4 | 839 // 1. jmp 4 |
880 // 2. body of optional block | 840 // 2. body of optional block |
881 // 3 jmp 5 | 841 // 3 jmp 5 |
882 // 4. state save 2 | 842 // 4. state save 2 |
883 // 5 ... | 843 // 5 ... |
884 // This code is less than ideal, with two jmps instead of one, because
we can only | 844 // This code is less than ideal, with two jmps instead of one, because
we can only |
885 // insert one instruction at the top of the block being iterated. | 845 // insert one instruction at the top of the block being iterated. |
886 { | 846 { |
887 int32_t jmp1_loc = blockTopLoc(TRUE); | 847 int32_t jmp1_loc = blockTopLoc(TRUE); |
888 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); | 848 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); |
889 | 849 |
890 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); | 850 int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1); |
891 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); | 851 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); |
892 | 852 |
893 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); | 853 appendOp(URX_JMP, jmp2_loc+2); |
894 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); | |
895 | 854 |
896 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); | 855 appendOp(URX_STATE_SAVE, jmp1_loc+1); |
897 fRXPat->fCompiledPat->addElement(save_op, *fStatus); | |
898 } | 856 } |
899 break; | 857 break; |
900 | 858 |
901 | 859 |
902 case doStar: | 860 case doStar: |
903 // Normal (greedy) * quantifier. | 861 // Normal (greedy) * quantifier. |
904 // Compiles to | 862 // Compiles to |
905 // 1. STATE_SAVE 4 | 863 // 1. STATE_SAVE 4 |
906 // 2. body of stuff being iterated over | 864 // 2. body of stuff being iterated over |
907 // 3. JMP_SAV 2 | 865 // 3. JMP_SAV 2 |
(...skipping 19 matching lines...) Expand all Loading... |
927 int32_t topLoc = blockTopLoc(FALSE); | 885 int32_t topLoc = blockTopLoc(FALSE); |
928 int32_t dataLoc = -1; | 886 int32_t dataLoc = -1; |
929 | 887 |
930 // Check for simple *, where the construct being repeated | 888 // Check for simple *, where the construct being repeated |
931 // compiled to single opcode, and might be optimizable. | 889 // compiled to single opcode, and might be optimizable. |
932 if (topLoc == fRXPat->fCompiledPat->size() - 1) { | 890 if (topLoc == fRXPat->fCompiledPat->size() - 1) { |
933 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); | 891 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); |
934 | 892 |
935 if (URX_TYPE(repeatedOp) == URX_SETREF) { | 893 if (URX_TYPE(repeatedOp) == URX_SETREF) { |
936 // Emit optimized code for a [char set]* | 894 // Emit optimized code for a [char set]* |
937 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); | 895 int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)
); |
938 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | 896 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); |
939 dataLoc = fRXPat->fFrameSize; | 897 dataLoc = allocateStackData(1); |
940 fRXPat->fFrameSize++; | 898 appendOp(URX_LOOP_C, dataLoc); |
941 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | |
942 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
943 break; | 899 break; |
944 } | 900 } |
945 | 901 |
946 if (URX_TYPE(repeatedOp) == URX_DOTANY || | 902 if (URX_TYPE(repeatedOp) == URX_DOTANY || |
947 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || | 903 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || |
948 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { | 904 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { |
949 // Emit Optimized code for .* operations. | 905 // Emit Optimized code for .* operations. |
950 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | 906 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); |
951 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | 907 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { |
952 // URX_LOOP_DOT_I operand is a flag indicating . matches
any mode. | 908 // URX_LOOP_DOT_I operand is a flag indicating . matches
any mode. |
953 loopOpI |= 1; | 909 loopOpI |= 1; |
954 } | 910 } |
955 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { | 911 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { |
956 loopOpI |= 2; | 912 loopOpI |= 2; |
957 } | 913 } |
958 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | 914 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); |
959 dataLoc = fRXPat->fFrameSize; | 915 dataLoc = allocateStackData(1); |
960 fRXPat->fFrameSize++; | 916 appendOp(URX_LOOP_C, dataLoc); |
961 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | |
962 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
963 break; | 917 break; |
964 } | 918 } |
965 } | 919 } |
966 | 920 |
967 // Emit general case code for this * | 921 // Emit general case code for this * |
968 // The optimizations did not apply. | 922 // The optimizations did not apply. |
969 | 923 |
970 int32_t saveStateLoc = blockTopLoc(TRUE); | 924 int32_t saveStateLoc = blockTopLoc(TRUE); |
971 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); | 925 int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1); |
972 | 926 |
973 // Check for minimum match length of zero, which requires | 927 // Check for minimum match length of zero, which requires |
974 // extra loop-breaking code. | 928 // extra loop-breaking code. |
975 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) ==
0) { | 929 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) ==
0) { |
976 insertOp(saveStateLoc); | 930 insertOp(saveStateLoc); |
977 dataLoc = fRXPat->fFrameSize; | 931 dataLoc = allocateStackData(1); |
978 fRXPat->fFrameSize++; | |
979 | 932 |
980 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); | 933 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc); |
981 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); | 934 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); |
982 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); | 935 jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2); |
983 } | 936 } |
984 | 937 |
985 // Locate the position in the compiled pattern where the match will
continue | 938 // Locate the position in the compiled pattern where the match will
continue |
986 // after completing the *. (4 or 5 in the comment above) | 939 // after completing the *. (4 or 5 in the comment above) |
987 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | 940 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
988 | 941 |
989 // Put together the save state op store it into the compiled code. | 942 // Put together the save state op and store it into the compiled cod
e. |
990 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); | 943 int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc); |
991 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); | 944 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); |
992 | 945 |
993 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled patt
ern. | 946 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled patt
ern. |
994 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); | 947 appendOp(jmpOp); |
995 } | 948 } |
996 break; | 949 break; |
997 | 950 |
998 case doNGStar: | 951 case doNGStar: |
999 // Non-greedy *? quantifier | 952 // Non-greedy *? quantifier |
1000 // compiles to | 953 // compiles to |
1001 // 1. JMP 3 | 954 // 1. JMP 3 |
1002 // 2. body of stuff being iterated over | 955 // 2. body of stuff being iterated over |
1003 // 3. STATE_SAVE 2 | 956 // 3. STATE_SAVE 2 |
1004 // 4 ... | 957 // 4 ... |
1005 { | 958 { |
1006 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1
. | 959 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1
. |
1007 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3
. | 960 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3
. |
1008 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); | 961 int32_t jmpOp = buildOp(URX_JMP, saveLoc); |
1009 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); | |
1010 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); | 962 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); |
1011 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); | 963 appendOp(URX_STATE_SAVE, jmpLoc+1); |
1012 } | 964 } |
1013 break; | 965 break; |
1014 | 966 |
1015 | 967 |
1016 case doIntervalInit: | 968 case doIntervalInit: |
1017 // The '{' opening an interval quantifier was just scanned. | 969 // The '{' opening an interval quantifier was just scanned. |
1018 // Init the counter varaiables that will accumulate the values as the di
gits | 970 // Init the counter varaiables that will accumulate the values as the di
gits |
1019 // are scanned. | 971 // are scanned. |
1020 fIntervalLow = 0; | 972 fIntervalLow = 0; |
1021 fIntervalUpper = -1; | 973 fIntervalUpper = -1; |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1070 int32_t topLoc = blockTopLoc(FALSE); | 1022 int32_t topLoc = blockTopLoc(FALSE); |
1071 | 1023 |
1072 // Produce normal looping code. | 1024 // Produce normal looping code. |
1073 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); | 1025 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); |
1074 | 1026 |
1075 // Surround the just-emitted normal looping code with a STO_SP ... L
D_SP | 1027 // Surround the just-emitted normal looping code with a STO_SP ... L
D_SP |
1076 // just as if the loop was inclosed in atomic parentheses. | 1028 // just as if the loop was inclosed in atomic parentheses. |
1077 | 1029 |
1078 // First the STO_SP before the start of the loop | 1030 // First the STO_SP before the start of the loop |
1079 insertOp(topLoc); | 1031 insertOp(topLoc); |
1080 int32_t varLoc = fRXPat->fDataSize; // Reserve a data locatio
n for saving the | 1032 |
1081 fRXPat->fDataSize += 1; // state stack ptr. | 1033 int32_t varLoc = allocateData(1); // Reserve a data location for
saving the |
1082 int32_t op = URX_BUILD(URX_STO_SP, varLoc); | 1034 int32_t op = buildOp(URX_STO_SP, varLoc); |
1083 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1035 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
1084 | 1036 |
1085 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); | 1037 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); |
1086 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topL
oc); | 1038 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topL
oc); |
1087 loopOp++; // point LoopOp after the just-inserted STO_SP | 1039 loopOp++; // point LoopOp after the just-inserted STO_SP |
1088 fRXPat->fCompiledPat->push(loopOp, *fStatus); | 1040 fRXPat->fCompiledPat->push(loopOp, *fStatus); |
1089 | 1041 |
1090 // Then the LD_SP after the end of the loop | 1042 // Then the LD_SP after the end of the loop |
1091 op = URX_BUILD(URX_LD_SP, varLoc); | 1043 appendOp(URX_LD_SP, varLoc); |
1092 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1093 } | 1044 } |
1094 | 1045 |
1095 break; | 1046 break; |
1096 | 1047 |
1097 case doNGInterval: | 1048 case doNGInterval: |
1098 // Finished scanning a non-greedy {lower,upper}? interval. Generate the
code for it. | 1049 // Finished scanning a non-greedy {lower,upper}? interval. Generate the
code for it. |
1099 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); | 1050 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); |
1100 break; | 1051 break; |
1101 | 1052 |
1102 case doIntervalError: | 1053 case doIntervalError: |
(...skipping 15 matching lines...) Expand all Loading... |
1118 error(U_REGEX_BAD_ESCAPE_SEQUENCE); | 1069 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
1119 } | 1070 } |
1120 literalChar(fC.fChar); | 1071 literalChar(fC.fChar); |
1121 break; | 1072 break; |
1122 | 1073 |
1123 | 1074 |
1124 case doDotAny: | 1075 case doDotAny: |
1125 // scanned a ".", match any single character. | 1076 // scanned a ".", match any single character. |
1126 { | 1077 { |
1127 fixLiterals(FALSE); | 1078 fixLiterals(FALSE); |
1128 int32_t op; | |
1129 if (fModeFlags & UREGEX_DOTALL) { | 1079 if (fModeFlags & UREGEX_DOTALL) { |
1130 op = URX_BUILD(URX_DOTANY_ALL, 0); | 1080 appendOp(URX_DOTANY_ALL, 0); |
1131 } else if (fModeFlags & UREGEX_UNIX_LINES) { | 1081 } else if (fModeFlags & UREGEX_UNIX_LINES) { |
1132 op = URX_BUILD(URX_DOTANY_UNIX, 0); | 1082 appendOp(URX_DOTANY_UNIX, 0); |
1133 } else { | 1083 } else { |
1134 op = URX_BUILD(URX_DOTANY, 0); | 1084 appendOp(URX_DOTANY, 0); |
1135 } | 1085 } |
1136 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1137 } | 1086 } |
1138 break; | 1087 break; |
1139 | 1088 |
1140 case doCaret: | 1089 case doCaret: |
1141 { | 1090 { |
1142 fixLiterals(FALSE); | 1091 fixLiterals(FALSE); |
1143 int32_t op = 0; | |
1144 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1092 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
1145 op = URX_CARET; | 1093 appendOp(URX_CARET, 0); |
1146 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1094 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
1147 op = URX_CARET_M; | 1095 appendOp(URX_CARET_M, 0); |
1148 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1096 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
1149 op = URX_CARET; // Only testing true start of input. | 1097 appendOp(URX_CARET, 0); // Only testing true start of input. |
1150 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1098 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
1151 op = URX_CARET_M_UNIX; | 1099 appendOp(URX_CARET_M_UNIX, 0); |
1152 } | 1100 } |
1153 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | |
1154 } | 1101 } |
1155 break; | 1102 break; |
1156 | 1103 |
1157 case doDollar: | 1104 case doDollar: |
1158 { | 1105 { |
1159 fixLiterals(FALSE); | 1106 fixLiterals(FALSE); |
1160 int32_t op = 0; | |
1161 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1107 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
1162 op = URX_DOLLAR; | 1108 appendOp(URX_DOLLAR, 0); |
1163 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1109 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
1164 op = URX_DOLLAR_M; | 1110 appendOp(URX_DOLLAR_M, 0); |
1165 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1111 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
1166 op = URX_DOLLAR_D; | 1112 appendOp(URX_DOLLAR_D, 0); |
1167 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1113 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
1168 op = URX_DOLLAR_MD; | 1114 appendOp(URX_DOLLAR_MD, 0); |
1169 } | 1115 } |
1170 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | |
1171 } | 1116 } |
1172 break; | 1117 break; |
1173 | 1118 |
1174 case doBackslashA: | 1119 case doBackslashA: |
1175 fixLiterals(FALSE); | 1120 fixLiterals(FALSE); |
1176 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); | 1121 appendOp(URX_CARET, 0); |
1177 break; | 1122 break; |
1178 | 1123 |
1179 case doBackslashB: | 1124 case doBackslashB: |
1180 { | 1125 { |
1181 #if UCONFIG_NO_BREAK_ITERATION==1 | 1126 #if UCONFIG_NO_BREAK_ITERATION==1 |
1182 if (fModeFlags & UREGEX_UWORD) { | 1127 if (fModeFlags & UREGEX_UWORD) { |
1183 error(U_UNSUPPORTED_ERROR); | 1128 error(U_UNSUPPORTED_ERROR); |
1184 } | 1129 } |
1185 #endif | 1130 #endif |
1186 fixLiterals(FALSE); | 1131 fixLiterals(FALSE); |
1187 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; | 1132 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; |
1188 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); | 1133 appendOp(op, 1); |
1189 } | 1134 } |
1190 break; | 1135 break; |
1191 | 1136 |
1192 case doBackslashb: | 1137 case doBackslashb: |
1193 { | 1138 { |
1194 #if UCONFIG_NO_BREAK_ITERATION==1 | 1139 #if UCONFIG_NO_BREAK_ITERATION==1 |
1195 if (fModeFlags & UREGEX_UWORD) { | 1140 if (fModeFlags & UREGEX_UWORD) { |
1196 error(U_UNSUPPORTED_ERROR); | 1141 error(U_UNSUPPORTED_ERROR); |
1197 } | 1142 } |
1198 #endif | 1143 #endif |
1199 fixLiterals(FALSE); | 1144 fixLiterals(FALSE); |
1200 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; | 1145 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; |
1201 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | 1146 appendOp(op, 0); |
1202 } | 1147 } |
1203 break; | 1148 break; |
1204 | 1149 |
1205 case doBackslashD: | 1150 case doBackslashD: |
1206 fixLiterals(FALSE); | 1151 fixLiterals(FALSE); |
1207 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus
); | 1152 appendOp(URX_BACKSLASH_D, 1); |
1208 break; | 1153 break; |
1209 | 1154 |
1210 case doBackslashd: | 1155 case doBackslashd: |
1211 fixLiterals(FALSE); | 1156 fixLiterals(FALSE); |
1212 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus
); | 1157 appendOp(URX_BACKSLASH_D, 0); |
1213 break; | 1158 break; |
1214 | 1159 |
1215 case doBackslashG: | 1160 case doBackslashG: |
1216 fixLiterals(FALSE); | 1161 fixLiterals(FALSE); |
1217 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus
); | 1162 appendOp(URX_BACKSLASH_G, 0); |
1218 break; | 1163 break; |
1219 | 1164 |
1220 case doBackslashS: | 1165 case doBackslashS: |
1221 fixLiterals(FALSE); | 1166 fixLiterals(FALSE); |
1222 fRXPat->fCompiledPat->addElement( | 1167 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); |
1223 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); | |
1224 break; | 1168 break; |
1225 | 1169 |
1226 case doBackslashs: | 1170 case doBackslashs: |
1227 fixLiterals(FALSE); | 1171 fixLiterals(FALSE); |
1228 fRXPat->fCompiledPat->addElement( | 1172 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); |
1229 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); | |
1230 break; | 1173 break; |
1231 | 1174 |
1232 case doBackslashW: | 1175 case doBackslashW: |
1233 fixLiterals(FALSE); | 1176 fixLiterals(FALSE); |
1234 fRXPat->fCompiledPat->addElement( | 1177 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); |
1235 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); | |
1236 break; | 1178 break; |
1237 | 1179 |
1238 case doBackslashw: | 1180 case doBackslashw: |
1239 fixLiterals(FALSE); | 1181 fixLiterals(FALSE); |
1240 fRXPat->fCompiledPat->addElement( | 1182 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); |
1241 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); | |
1242 break; | 1183 break; |
1243 | 1184 |
1244 case doBackslashX: | 1185 case doBackslashX: |
1245 fixLiterals(FALSE); | 1186 fixLiterals(FALSE); |
1246 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus
); | 1187 appendOp(URX_BACKSLASH_X, 0); |
1247 break; | 1188 break; |
1248 | 1189 |
1249 | 1190 |
1250 case doBackslashZ: | 1191 case doBackslashZ: |
1251 fixLiterals(FALSE); | 1192 fixLiterals(FALSE); |
1252 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); | 1193 appendOp(URX_DOLLAR, 0); |
1253 break; | 1194 break; |
1254 | 1195 |
1255 case doBackslashz: | 1196 case doBackslashz: |
1256 fixLiterals(FALSE); | 1197 fixLiterals(FALSE); |
1257 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus
); | 1198 appendOp(URX_BACKSLASH_Z, 0); |
1258 break; | 1199 break; |
1259 | 1200 |
1260 case doEscapeError: | 1201 case doEscapeError: |
1261 error(U_REGEX_BAD_ESCAPE_SEQUENCE); | 1202 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
1262 break; | 1203 break; |
1263 | 1204 |
1264 case doExit: | 1205 case doExit: |
1265 fixLiterals(FALSE); | 1206 fixLiterals(FALSE); |
1266 returnVal = FALSE; | 1207 returnVal = FALSE; |
1267 break; | 1208 break; |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1307 } | 1248 } |
1308 | 1249 |
1309 // Scan of the back reference in the source regexp is complete. Now
generate | 1250 // Scan of the back reference in the source regexp is complete. Now
generate |
1310 // the compiled code for it. | 1251 // the compiled code for it. |
1311 // Because capture groups can be forward-referenced by back-referenc
es, | 1252 // Because capture groups can be forward-referenced by back-referenc
es, |
1312 // we fill the operand with the capture group number. At the end | 1253 // we fill the operand with the capture group number. At the end |
1313 // of compilation, it will be changed to the variable's location. | 1254 // of compilation, it will be changed to the variable's location. |
1314 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal
escape sequence, | 1255 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal
escape sequence, |
1315 // and shouldn't enter this code path at
all. | 1256 // and shouldn't enter this code path at
all. |
1316 fixLiterals(FALSE); | 1257 fixLiterals(FALSE); |
1317 int32_t op; | |
1318 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1258 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
1319 op = URX_BUILD(URX_BACKREF_I, groupNum); | 1259 appendOp(URX_BACKREF_I, groupNum); |
1320 } else { | 1260 } else { |
1321 op = URX_BUILD(URX_BACKREF, groupNum); | 1261 appendOp(URX_BACKREF, groupNum); |
1322 } | 1262 } |
1323 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1324 } | 1263 } |
1325 break; | 1264 break; |
1326 | 1265 |
1327 | 1266 |
1328 case doPossessivePlus: | 1267 case doPossessivePlus: |
1329 // Possessive ++ quantifier. | 1268 // Possessive ++ quantifier. |
1330 // Compiles to | 1269 // Compiles to |
1331 // 1. STO_SP | 1270 // 1. STO_SP |
1332 // 2. body of stuff being iterated over | 1271 // 2. body of stuff being iterated over |
1333 // 3. STATE_SAVE 5 | 1272 // 3. STATE_SAVE 5 |
1334 // 4. JMP 2 | 1273 // 4. JMP 2 |
1335 // 5. LD_SP | 1274 // 5. LD_SP |
1336 // 6. ... | 1275 // 6. ... |
1337 // | 1276 // |
1338 // Note: TODO: This is pretty inefficient. A mass of saved state is
built up | 1277 // Note: TODO: This is pretty inefficient. A mass of saved state is
built up |
1339 // then unconditionally discarded. Perhaps introduce a n
ew opcode. Ticket 6056 | 1278 // then unconditionally discarded. Perhaps introduce a n
ew opcode. Ticket 6056 |
1340 // | 1279 // |
1341 { | 1280 { |
1342 // Emit the STO_SP | 1281 // Emit the STO_SP |
1343 int32_t topLoc = blockTopLoc(TRUE); | 1282 int32_t topLoc = blockTopLoc(TRUE); |
1344 int32_t stoLoc = fRXPat->fDataSize; | 1283 int32_t stoLoc = allocateData(1); // Reserve the data location fo
r storing save stack ptr. |
1345 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. | 1284 int32_t op = buildOp(URX_STO_SP, stoLoc); |
1346 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | |
1347 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1285 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
1348 | 1286 |
1349 // Emit the STATE_SAVE | 1287 // Emit the STATE_SAVE |
1350 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); | 1288 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); |
1351 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1352 | 1289 |
1353 // Emit the JMP | 1290 // Emit the JMP |
1354 op = URX_BUILD(URX_JMP, topLoc+1); | 1291 appendOp(URX_JMP, topLoc+1); |
1355 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1356 | 1292 |
1357 // Emit the LD_SP | 1293 // Emit the LD_SP |
1358 op = URX_BUILD(URX_LD_SP, stoLoc); | 1294 appendOp(URX_LD_SP, stoLoc); |
1359 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1360 } | 1295 } |
1361 break; | 1296 break; |
1362 | 1297 |
1363 case doPossessiveStar: | 1298 case doPossessiveStar: |
1364 // Possessive *+ quantifier. | 1299 // Possessive *+ quantifier. |
1365 // Compiles to | 1300 // Compiles to |
1366 // 1. STO_SP loc | 1301 // 1. STO_SP loc |
1367 // 2. STATE_SAVE 5 | 1302 // 2. STATE_SAVE 5 |
1368 // 3. body of stuff being iterated over | 1303 // 3. body of stuff being iterated over |
1369 // 4. JMP 2 | 1304 // 4. JMP 2 |
1370 // 5. LD_SP loc | 1305 // 5. LD_SP loc |
1371 // 6 ... | 1306 // 6 ... |
1372 // TODO: do something to cut back the state stack each time through the
loop. | 1307 // TODO: do something to cut back the state stack each time through the
loop. |
1373 { | 1308 { |
1374 // Reserve two slots at the top of the block. | 1309 // Reserve two slots at the top of the block. |
1375 int32_t topLoc = blockTopLoc(TRUE); | 1310 int32_t topLoc = blockTopLoc(TRUE); |
1376 insertOp(topLoc); | 1311 insertOp(topLoc); |
1377 | 1312 |
1378 // emit STO_SP loc | 1313 // emit STO_SP loc |
1379 int32_t stoLoc = fRXPat->fDataSize; | 1314 int32_t stoLoc = allocateData(1); // Reserve the data location
for storing save stack ptr. |
1380 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. | 1315 int32_t op = buildOp(URX_STO_SP, stoLoc); |
1381 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | |
1382 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1316 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
1383 | 1317 |
1384 // Emit the SAVE_STATE 5 | 1318 // Emit the SAVE_STATE 5 |
1385 int32_t L7 = fRXPat->fCompiledPat->size()+1; | 1319 int32_t L7 = fRXPat->fCompiledPat->size()+1; |
1386 op = URX_BUILD(URX_STATE_SAVE, L7); | 1320 op = buildOp(URX_STATE_SAVE, L7); |
1387 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | 1321 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); |
1388 | 1322 |
1389 // Append the JMP operation. | 1323 // Append the JMP operation. |
1390 op = URX_BUILD(URX_JMP, topLoc+1); | 1324 appendOp(URX_JMP, topLoc+1); |
1391 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1392 | 1325 |
1393 // Emit the LD_SP loc | 1326 // Emit the LD_SP loc |
1394 op = URX_BUILD(URX_LD_SP, stoLoc); | 1327 appendOp(URX_LD_SP, stoLoc); |
1395 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1396 } | 1328 } |
1397 break; | 1329 break; |
1398 | 1330 |
1399 case doPossessiveOpt: | 1331 case doPossessiveOpt: |
1400 // Possessive ?+ quantifier. | 1332 // Possessive ?+ quantifier. |
1401 // Compiles to | 1333 // Compiles to |
1402 // 1. STO_SP loc | 1334 // 1. STO_SP loc |
1403 // 2. SAVE_STATE 5 | 1335 // 2. SAVE_STATE 5 |
1404 // 3. body of optional block | 1336 // 3. body of optional block |
1405 // 4. LD_SP loc | 1337 // 4. LD_SP loc |
1406 // 5. ... | 1338 // 5. ... |
1407 // | 1339 // |
1408 { | 1340 { |
1409 // Reserve two slots at the top of the block. | 1341 // Reserve two slots at the top of the block. |
1410 int32_t topLoc = blockTopLoc(TRUE); | 1342 int32_t topLoc = blockTopLoc(TRUE); |
1411 insertOp(topLoc); | 1343 insertOp(topLoc); |
1412 | 1344 |
1413 // Emit the STO_SP | 1345 // Emit the STO_SP |
1414 int32_t stoLoc = fRXPat->fDataSize; | 1346 int32_t stoLoc = allocateData(1); // Reserve the data location f
or storing save stack ptr. |
1415 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. | 1347 int32_t op = buildOp(URX_STO_SP, stoLoc); |
1416 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | |
1417 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1348 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
1418 | 1349 |
1419 // Emit the SAVE_STATE | 1350 // Emit the SAVE_STATE |
1420 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | 1351 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
1421 op = URX_BUILD(URX_STATE_SAVE, continueLoc); | 1352 op = buildOp(URX_STATE_SAVE, continueLoc); |
1422 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | 1353 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); |
1423 | 1354 |
1424 // Emit the LD_SP | 1355 // Emit the LD_SP |
1425 op = URX_BUILD(URX_LD_SP, stoLoc); | 1356 appendOp(URX_LD_SP, stoLoc); |
1426 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1427 } | 1357 } |
1428 break; | 1358 break; |
1429 | 1359 |
1430 | 1360 |
1431 case doBeginMatchMode: | 1361 case doBeginMatchMode: |
1432 fNewModeFlags = fModeFlags; | 1362 fNewModeFlags = fModeFlags; |
1433 fSetModeFlag = TRUE; | 1363 fSetModeFlag = TRUE; |
1434 break; | 1364 break; |
1435 | 1365 |
1436 case doMatchMode: // (?i) and similar | 1366 case doMatchMode: // (?i) and similar |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1473 // We've got a (?i: or similar. Begin a parenthesized block, save old | 1403 // We've got a (?i: or similar. Begin a parenthesized block, save old |
1474 // mode flags so they can be restored at the close of the block. | 1404 // mode flags so they can be restored at the close of the block. |
1475 // | 1405 // |
1476 // Compile to a | 1406 // Compile to a |
1477 // - NOP, which later may be replaced by a save-state if the | 1407 // - NOP, which later may be replaced by a save-state if the |
1478 // parenthesized group gets a * quantifier, followed by | 1408 // parenthesized group gets a * quantifier, followed by |
1479 // - NOP, which may later be replaced by a save-state if there | 1409 // - NOP, which may later be replaced by a save-state if there |
1480 // is an '|' alternation within the parens. | 1410 // is an '|' alternation within the parens. |
1481 { | 1411 { |
1482 fixLiterals(FALSE); | 1412 fixLiterals(FALSE); |
1483 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 1413 appendOp(URX_NOP, 0); |
1484 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | 1414 appendOp(URX_NOP, 0); |
1485 | 1415 |
1486 // On the Parentheses stack, start a new frame and add the postions | 1416 // On the Parentheses stack, start a new frame and add the postions |
1487 // of the two NOPs (a normal non-capturing () frame, except for th
e | 1417 // of the two NOPs (a normal non-capturing () frame, except for th
e |
1488 // saving of the orignal mode flags.) | 1418 // saving of the orignal mode flags.) |
1489 fParenStack.push(fModeFlags, *fStatus); | 1419 fParenStack.push(fModeFlags, *fStatus); |
1490 fParenStack.push(flags, *fStatus); // Fra
me Marker | 1420 fParenStack.push(flags, *fStatus); // Fra
me Marker |
1491 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP | 1421 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP |
1492 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP | 1422 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP |
1493 | 1423 |
1494 // Set the current mode flags to the new values. | 1424 // Set the current mode flags to the new values. |
(...skipping 316 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1811 // string in a pattern, emit the code to match the | 1741 // string in a pattern, emit the code to match the |
1812 // accumulated literal string. | 1742 // accumulated literal string. |
1813 // | 1743 // |
1814 // Optionally, split the last char of the string off in
to | 1744 // Optionally, split the last char of the string off in
to |
1815 // a single "ONE_CHAR" operation, so that quantifiers c
an | 1745 // a single "ONE_CHAR" operation, so that quantifiers c
an |
1816 // apply to that char alone. Example: abc* | 1746 // apply to that char alone. Example: abc* |
1817 // The * must apply to the 'c' only. | 1747 // The * must apply to the 'c' only. |
1818 // | 1748 // |
1819 //------------------------------------------------------------------------------ | 1749 //------------------------------------------------------------------------------ |
1820 void RegexCompile::fixLiterals(UBool split) { | 1750 void RegexCompile::fixLiterals(UBool split) { |
1821 int32_t op = 0; // An op from/for the compiled patter
n. | |
1822 | 1751 |
1823 // If no literal characters have been scanned but not yet had code generated | 1752 // If no literal characters have been scanned but not yet had code generated |
1824 // for them, nothing needs to be done. | 1753 // for them, nothing needs to be done. |
1825 if (fLiteralChars.length() == 0) { | 1754 if (fLiteralChars.length() == 0) { |
1826 return; | 1755 return; |
1827 } | 1756 } |
1828 | 1757 |
1829 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.lengt
h(), -1); | 1758 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.lengt
h(), -1); |
1830 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); | 1759 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); |
1831 | 1760 |
(...skipping 18 matching lines...) Expand all Loading... |
1850 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1779 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
1851 fLiteralChars.foldCase(); | 1780 fLiteralChars.foldCase(); |
1852 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(),
-1); | 1781 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(),
-1); |
1853 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); | 1782 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); |
1854 } | 1783 } |
1855 | 1784 |
1856 if (indexOfLastCodePoint == 0) { | 1785 if (indexOfLastCodePoint == 0) { |
1857 // Single character, emit a URX_ONECHAR op to match it. | 1786 // Single character, emit a URX_ONECHAR op to match it. |
1858 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && | 1787 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && |
1859 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { | 1788 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { |
1860 op = URX_BUILD(URX_ONECHAR_I, lastCodePoint); | 1789 appendOp(URX_ONECHAR_I, lastCodePoint); |
1861 } else { | 1790 } else { |
1862 op = URX_BUILD(URX_ONECHAR, lastCodePoint); | 1791 appendOp(URX_ONECHAR, lastCodePoint); |
1863 } | 1792 } |
1864 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1865 } else { | 1793 } else { |
1866 // Two or more chars, emit a URX_STRING to match them. | 1794 // Two or more chars, emit a URX_STRING to match them. |
| 1795 if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length()
> 0x00ffffff) { |
| 1796 error(U_REGEX_PATTERN_TOO_BIG); |
| 1797 } |
1867 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1798 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
1868 op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length()); | 1799 appendOp(URX_STRING_I, fRXPat->fLiteralText.length()); |
1869 } else { | 1800 } else { |
1870 // TODO here: add optimization to split case sensitive strings of l
ength two | 1801 // TODO here: add optimization to split case sensitive strings of l
ength two |
1871 // into two single char ops, for efficiency. | 1802 // into two single char ops, for efficiency. |
1872 op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length()); | 1803 appendOp(URX_STRING, fRXPat->fLiteralText.length()); |
1873 } | 1804 } |
1874 fRXPat->fCompiledPat->addElement(op, *fStatus); | 1805 appendOp(URX_STRING_LEN, fLiteralChars.length()); |
1875 op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length()); | |
1876 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1877 | 1806 |
1878 // Add this string into the accumulated strings of the compiled pattern. | 1807 // Add this string into the accumulated strings of the compiled pattern. |
1879 fRXPat->fLiteralText.append(fLiteralChars); | 1808 fRXPat->fLiteralText.append(fLiteralChars); |
1880 } | 1809 } |
1881 | 1810 |
1882 fLiteralChars.remove(); | 1811 fLiteralChars.remove(); |
1883 } | 1812 } |
1884 | 1813 |
1885 | 1814 |
1886 | 1815 int32_t RegexCompile::buildOp(int32_t type, int32_t val) { |
1887 | 1816 if (U_FAILURE(*fStatus)) { |
| 1817 return 0; |
| 1818 } |
| 1819 if (type < 0 || type > 255) { |
| 1820 U_ASSERT(FALSE); |
| 1821 error(U_REGEX_INTERNAL_ERROR); |
| 1822 type = URX_RESERVED_OP; |
| 1823 } |
| 1824 if (val > 0x00ffffff) { |
| 1825 U_ASSERT(FALSE); |
| 1826 error(U_REGEX_INTERNAL_ERROR); |
| 1827 val = 0; |
| 1828 } |
| 1829 if (val < 0) { |
| 1830 if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) { |
| 1831 U_ASSERT(FALSE); |
| 1832 error(U_REGEX_INTERNAL_ERROR); |
| 1833 return -1; |
| 1834 } |
| 1835 if (URX_TYPE(val) != 0xff) { |
| 1836 U_ASSERT(FALSE); |
| 1837 error(U_REGEX_INTERNAL_ERROR); |
| 1838 return -1; |
| 1839 } |
| 1840 type = URX_RESERVED_OP_N; |
| 1841 } |
| 1842 return (type << 24) | val; |
| 1843 } |
1888 | 1844 |
1889 | 1845 |
1890 //------------------------------------------------------------------------------ | 1846 //------------------------------------------------------------------------------ |
| 1847 // |
| 1848 // appendOp() Append a new instruction onto the compiled pattern |
| 1849 // Includes error checking, limiting the size of the |
| 1850 // pattern to lengths that can be represented in the |
| 1851 // 24 bit operand field of an instruction. |
| 1852 // |
| 1853 //------------------------------------------------------------------------------ |
| 1854 void RegexCompile::appendOp(int32_t op) { |
| 1855 if (U_FAILURE(*fStatus)) { |
| 1856 return; |
| 1857 } |
| 1858 fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1859 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) { |
| 1860 error(U_REGEX_PATTERN_TOO_BIG); |
| 1861 } |
| 1862 } |
| 1863 |
| 1864 void RegexCompile::appendOp(int32_t type, int32_t val) { |
| 1865 appendOp(buildOp(type, val)); |
| 1866 } |
| 1867 |
| 1868 |
| 1869 //------------------------------------------------------------------------------ |
1891 // | 1870 // |
1892 // insertOp() Insert a slot for a new opcode into the already | 1871 // insertOp() Insert a slot for a new opcode into the already |
1893 // compiled pattern code. | 1872 // compiled pattern code. |
1894 // | 1873 // |
1895 // Fill the slot with a NOP. Our caller will replace i
t | 1874 // Fill the slot with a NOP. Our caller will replace i
t |
1896 // with what they really wanted. | 1875 // with what they really wanted. |
1897 // | 1876 // |
1898 //------------------------------------------------------------------------------ | 1877 //------------------------------------------------------------------------------ |
1899 void RegexCompile::insertOp(int32_t where) { | 1878 void RegexCompile::insertOp(int32_t where) { |
1900 UVector64 *code = fRXPat->fCompiledPat; | 1879 UVector64 *code = fRXPat->fCompiledPat; |
1901 U_ASSERT(where>0 && where < code->size()); | 1880 U_ASSERT(where>0 && where < code->size()); |
1902 | 1881 |
1903 int32_t nop = URX_BUILD(URX_NOP, 0); | 1882 int32_t nop = buildOp(URX_NOP, 0); |
1904 code->insertElementAt(nop, where, *fStatus); | 1883 code->insertElementAt(nop, where, *fStatus); |
1905 | 1884 |
1906 // Walk through the pattern, looking for any ops with targets that | 1885 // Walk through the pattern, looking for any ops with targets that |
1907 // were moved down by the insert. Fix them. | 1886 // were moved down by the insert. Fix them. |
1908 int32_t loc; | 1887 int32_t loc; |
1909 for (loc=0; loc<code->size(); loc++) { | 1888 for (loc=0; loc<code->size(); loc++) { |
1910 int32_t op = (int32_t)code->elementAti(loc); | 1889 int32_t op = (int32_t)code->elementAti(loc); |
1911 int32_t opType = URX_TYPE(op); | 1890 int32_t opType = URX_TYPE(op); |
1912 int32_t opValue = URX_VAL(op); | 1891 int32_t opValue = URX_VAL(op); |
1913 if ((opType == URX_JMP || | 1892 if ((opType == URX_JMP || |
1914 opType == URX_JMPX || | 1893 opType == URX_JMPX || |
1915 opType == URX_STATE_SAVE || | 1894 opType == URX_STATE_SAVE || |
1916 opType == URX_CTR_LOOP || | 1895 opType == URX_CTR_LOOP || |
1917 opType == URX_CTR_LOOP_NG || | 1896 opType == URX_CTR_LOOP_NG || |
1918 opType == URX_JMP_SAV || | 1897 opType == URX_JMP_SAV || |
1919 opType == URX_JMP_SAV_X || | 1898 opType == URX_JMP_SAV_X || |
1920 opType == URX_RELOC_OPRND) && opValue > where) { | 1899 opType == URX_RELOC_OPRND) && opValue > where) { |
1921 // Target location for this opcode is after the insertion point and | 1900 // Target location for this opcode is after the insertion point and |
1922 // needs to be incremented to adjust for the insertion. | 1901 // needs to be incremented to adjust for the insertion. |
1923 opValue++; | 1902 opValue++; |
1924 op = URX_BUILD(opType, opValue); | 1903 op = buildOp(opType, opValue); |
1925 code->setElementAt(op, loc); | 1904 code->setElementAt(op, loc); |
1926 } | 1905 } |
1927 } | 1906 } |
1928 | 1907 |
1929 // Now fix up the parentheses stack. All positive values in it are location
s in | 1908 // Now fix up the parentheses stack. All positive values in it are location
s in |
1930 // the compiled pattern. (Negative values are frame boundaries, and don't
need fixing.) | 1909 // the compiled pattern. (Negative values are frame boundaries, and don't
need fixing.) |
1931 for (loc=0; loc<fParenStack.size(); loc++) { | 1910 for (loc=0; loc<fParenStack.size(); loc++) { |
1932 int32_t x = fParenStack.elementAti(loc); | 1911 int32_t x = fParenStack.elementAti(loc); |
1933 U_ASSERT(x < code->size()); | 1912 U_ASSERT(x < code->size()); |
1934 if (x>where) { | 1913 if (x>where) { |
1935 x++; | 1914 x++; |
1936 fParenStack.setElementAt(x, loc); | 1915 fParenStack.setElementAt(x, loc); |
1937 } | 1916 } |
1938 } | 1917 } |
1939 | 1918 |
1940 if (fMatchCloseParen > where) { | 1919 if (fMatchCloseParen > where) { |
1941 fMatchCloseParen++; | 1920 fMatchCloseParen++; |
1942 } | 1921 } |
1943 if (fMatchOpenParen > where) { | 1922 if (fMatchOpenParen > where) { |
1944 fMatchOpenParen++; | 1923 fMatchOpenParen++; |
1945 } | 1924 } |
1946 } | 1925 } |
1947 | 1926 |
1948 | 1927 |
1949 | |
1950 //------------------------------------------------------------------------------ | 1928 //------------------------------------------------------------------------------ |
1951 // | 1929 // |
| 1930 // allocateData() Allocate storage in the matcher's static data area. |
| 1931 // Return the index for the newly allocated data. |
| 1932 // The storage won't actually exist until we are running
a match |
| 1933 // operation, but the storage indexes are inserted into
various |
| 1934 // opcodes while compiling the pattern. |
| 1935 // |
| 1936 //------------------------------------------------------------------------------ |
| 1937 int32_t RegexCompile::allocateData(int32_t size) { |
| 1938 if (U_FAILURE(*fStatus)) { |
| 1939 return 0; |
| 1940 } |
| 1941 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) { |
| 1942 error(U_REGEX_INTERNAL_ERROR); |
| 1943 return 0; |
| 1944 } |
| 1945 int32_t dataIndex = fRXPat->fDataSize; |
| 1946 fRXPat->fDataSize += size; |
| 1947 if (fRXPat->fDataSize >= 0x00fffff0) { |
| 1948 error(U_REGEX_INTERNAL_ERROR); |
| 1949 } |
| 1950 return dataIndex; |
| 1951 } |
| 1952 |
| 1953 |
| 1954 //------------------------------------------------------------------------------ |
| 1955 // |
| 1956 // allocateStackData() Allocate space in the back-tracking stack frame. |
| 1957 // Return the index for the newly allocated data. |
| 1958 // The frame indexes are inserted into various |
| 1959 // opcodes while compiling the pattern, meaning that fra
me |
| 1960 // size must be restricted to the size that will fit |
| 1961 // as an operand (24 bits). |
| 1962 // |
| 1963 //------------------------------------------------------------------------------ |
| 1964 int32_t RegexCompile::allocateStackData(int32_t size) { |
| 1965 if (U_FAILURE(*fStatus)) { |
| 1966 return 0; |
| 1967 } |
| 1968 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) { |
| 1969 error(U_REGEX_INTERNAL_ERROR); |
| 1970 return 0; |
| 1971 } |
| 1972 int32_t dataIndex = fRXPat->fFrameSize; |
| 1973 fRXPat->fFrameSize += size; |
| 1974 if (fRXPat->fFrameSize >= 0x00fffff0) { |
| 1975 error(U_REGEX_PATTERN_TOO_BIG); |
| 1976 } |
| 1977 return dataIndex; |
| 1978 } |
| 1979 |
| 1980 |
| 1981 //------------------------------------------------------------------------------ |
| 1982 // |
1952 // blockTopLoc() Find or create a location in the compiled pattern | 1983 // blockTopLoc() Find or create a location in the compiled pattern |
1953 // at the start of the operation or block that has | 1984 // at the start of the operation or block that has |
1954 // just been compiled. Needed when a quantifier (* or | 1985 // just been compiled. Needed when a quantifier (* or |
1955 // whatever) appears, and we need to add an operation | 1986 // whatever) appears, and we need to add an operation |
1956 // at the start of the thing being quantified. | 1987 // at the start of the thing being quantified. |
1957 // | 1988 // |
1958 // (Parenthesized Blocks) have a slot with a NOP that | 1989 // (Parenthesized Blocks) have a slot with a NOP that |
1959 // is reserved for this purpose. .* or similar don't | 1990 // is reserved for this purpose. .* or similar don't |
1960 // and a slot needs to be added. | 1991 // and a slot needs to be added. |
1961 // | 1992 // |
(...skipping 19 matching lines...) Expand all Loading... |
1981 // No slot for STATE_SAVE was pre-reserved in the compiled code. | 2012 // No slot for STATE_SAVE was pre-reserved in the compiled code. |
1982 // We need to make space now. | 2013 // We need to make space now. |
1983 theLoc = fRXPat->fCompiledPat->size()-1; | 2014 theLoc = fRXPat->fCompiledPat->size()-1; |
1984 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc); | 2015 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc); |
1985 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) { | 2016 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) { |
1986 // Strings take two opcode, we want the position of the first one. | 2017 // Strings take two opcode, we want the position of the first one. |
1987 // We can have a string at this point if a single character case-fol
ded to two. | 2018 // We can have a string at this point if a single character case-fol
ded to two. |
1988 theLoc--; | 2019 theLoc--; |
1989 } | 2020 } |
1990 if (reserveLoc) { | 2021 if (reserveLoc) { |
1991 int32_t nop = URX_BUILD(URX_NOP, 0); | 2022 int32_t nop = buildOp(URX_NOP, 0); |
1992 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); | 2023 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); |
1993 } | 2024 } |
1994 } | 2025 } |
1995 return theLoc; | 2026 return theLoc; |
1996 } | 2027 } |
1997 | 2028 |
1998 | 2029 |
1999 | 2030 |
2000 //------------------------------------------------------------------------------ | 2031 //------------------------------------------------------------------------------ |
2001 // | 2032 // |
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2056 case capturing: | 2087 case capturing: |
2057 // Capturing Parentheses. | 2088 // Capturing Parentheses. |
2058 // Insert a End Capture op into the pattern. | 2089 // Insert a End Capture op into the pattern. |
2059 // The frame offset of the variables for this cg is obtained from the | 2090 // The frame offset of the variables for this cg is obtained from the |
2060 // start capture op and put it into the end-capture op. | 2091 // start capture op and put it into the end-capture op. |
2061 { | 2092 { |
2062 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMat
chOpenParen+1); | 2093 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMat
chOpenParen+1); |
2063 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); | 2094 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); |
2064 | 2095 |
2065 int32_t frameVarLocation = URX_VAL(captureOp); | 2096 int32_t frameVarLocation = URX_VAL(captureOp); |
2066 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation
); | 2097 appendOp(URX_END_CAPTURE, frameVarLocation); |
2067 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); | |
2068 } | 2098 } |
2069 break; | 2099 break; |
2070 case atomic: | 2100 case atomic: |
2071 // Atomic Parenthesis. | 2101 // Atomic Parenthesis. |
2072 // Insert a LD_SP operation to restore the state stack to the position | 2102 // Insert a LD_SP operation to restore the state stack to the position |
2073 // it was when the atomic parens were entered. | 2103 // it was when the atomic parens were entered. |
2074 { | 2104 { |
2075 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOp
enParen+1); | 2105 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOp
enParen+1); |
2076 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); | 2106 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); |
2077 int32_t stoLoc = URX_VAL(stoOp); | 2107 int32_t stoLoc = URX_VAL(stoOp); |
2078 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); | 2108 appendOp(URX_LD_SP, stoLoc); |
2079 fRXPat->fCompiledPat->addElement(ldOp, *fStatus); | |
2080 } | 2109 } |
2081 break; | 2110 break; |
2082 | 2111 |
2083 case lookAhead: | 2112 case lookAhead: |
2084 { | 2113 { |
2085 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); | 2114 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); |
2086 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | 2115 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); |
2087 int32_t dataLoc = URX_VAL(startOp); | 2116 int32_t dataLoc = URX_VAL(startOp); |
2088 int32_t op = URX_BUILD(URX_LA_END, dataLoc); | 2117 appendOp(URX_LA_END, dataLoc); |
2089 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
2090 } | 2118 } |
2091 break; | 2119 break; |
2092 | 2120 |
2093 case negLookAhead: | 2121 case negLookAhead: |
2094 { | 2122 { |
2095 // See comment at doOpenLookAheadNeg | 2123 // See comment at doOpenLookAheadNeg |
2096 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-1); | 2124 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-1); |
2097 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | 2125 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); |
2098 int32_t dataLoc = URX_VAL(startOp); | 2126 int32_t dataLoc = URX_VAL(startOp); |
2099 int32_t op = URX_BUILD(URX_LA_END, dataLoc); | 2127 appendOp(URX_LA_END, dataLoc); |
2100 fRXPat->fCompiledPat->addElement(op, *fStatus); | 2128 appendOp(URX_BACKTRACK, 0); |
2101 op = URX_BUILD(URX_BACKTRACK, 0); | 2129 appendOp(URX_LA_END, dataLoc); |
2102 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
2103 op = URX_BUILD(URX_LA_END, dataLoc); | |
2104 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
2105 | 2130 |
2106 // Patch the URX_SAVE near the top of the block. | 2131 // Patch the URX_SAVE near the top of the block. |
2107 // The destination of the SAVE is the final LA_END that was just add
ed. | 2132 // The destination of the SAVE is the final LA_END that was just add
ed. |
2108 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen); | 2133 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen); |
2109 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); | 2134 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); |
2110 int32_t dest = fRXPat->fCompiledPat->size()-1; | 2135 int32_t dest = fRXPat->fCompiledPat->size()-1; |
2111 saveOp = URX_BUILD(URX_STATE_SAVE, dest); | 2136 saveOp = buildOp(URX_STATE_SAVE, dest); |
2112 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); | 2137 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); |
2113 } | 2138 } |
2114 break; | 2139 break; |
2115 | 2140 |
2116 case lookBehind: | 2141 case lookBehind: |
2117 { | 2142 { |
2118 // See comment at doOpenLookBehind. | 2143 // See comment at doOpenLookBehind. |
2119 | 2144 |
2120 // Append the URX_LB_END and URX_LA_END to the compiled pattern. | 2145 // Append the URX_LB_END and URX_LA_END to the compiled pattern. |
2121 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-4); | 2146 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-4); |
2122 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | 2147 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); |
2123 int32_t dataLoc = URX_VAL(startOp); | 2148 int32_t dataLoc = URX_VAL(startOp); |
2124 int32_t op = URX_BUILD(URX_LB_END, dataLoc); | 2149 appendOp(URX_LB_END, dataLoc); |
2125 fRXPat->fCompiledPat->addElement(op, *fStatus); | 2150 appendOp(URX_LA_END, dataLoc); |
2126 op = URX_BUILD(URX_LA_END, dataLoc); | |
2127 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
2128 | 2151 |
2129 // Determine the min and max bounds for the length of the | 2152 // Determine the min and max bounds for the length of the |
2130 // string that the pattern can match. | 2153 // string that the pattern can match. |
2131 // An unbounded upper limit is an error. | 2154 // An unbounded upper limit is an error. |
2132 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | 2155 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; |
2133 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | 2156 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); |
2134 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | 2157 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); |
| 2158 if (URX_TYPE(maxML) != 0) { |
| 2159 error(U_REGEX_LOOK_BEHIND_LIMIT); |
| 2160 break; |
| 2161 } |
2135 if (maxML == INT32_MAX) { | 2162 if (maxML == INT32_MAX) { |
2136 error(U_REGEX_LOOK_BEHIND_LIMIT); | 2163 error(U_REGEX_LOOK_BEHIND_LIMIT); |
2137 break; | 2164 break; |
2138 } | 2165 } |
2139 U_ASSERT(minML <= maxML); | 2166 U_ASSERT(minML <= maxML); |
2140 | 2167 |
2141 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat | 2168 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat |
2142 // appears at the top of the look-behind block, at location fMatchO
penParen+1 | 2169 // appears at the top of the look-behind block, at location fMatchO
penParen+1 |
2143 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); | 2170 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); |
2144 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); | 2171 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); |
2145 | 2172 |
2146 } | 2173 } |
2147 break; | 2174 break; |
2148 | 2175 |
2149 | 2176 |
2150 | 2177 |
2151 case lookBehindN: | 2178 case lookBehindN: |
2152 { | 2179 { |
2153 // See comment at doOpenLookBehindNeg. | 2180 // See comment at doOpenLookBehindNeg. |
2154 | 2181 |
2155 // Append the URX_LBN_END to the compiled pattern. | 2182 // Append the URX_LBN_END to the compiled pattern. |
2156 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); | 2183 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); |
2157 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | 2184 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); |
2158 int32_t dataLoc = URX_VAL(startOp); | 2185 int32_t dataLoc = URX_VAL(startOp); |
2159 int32_t op = URX_BUILD(URX_LBN_END, dataLoc); | 2186 appendOp(URX_LBN_END, dataLoc); |
2160 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
2161 | 2187 |
2162 // Determine the min and max bounds for the length of the | 2188 // Determine the min and max bounds for the length of the |
2163 // string that the pattern can match. | 2189 // string that the pattern can match. |
2164 // An unbounded upper limit is an error. | 2190 // An unbounded upper limit is an error. |
2165 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | 2191 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; |
2166 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | 2192 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); |
2167 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | 2193 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); |
| 2194 if (URX_TYPE(maxML) != 0) { |
| 2195 error(U_REGEX_LOOK_BEHIND_LIMIT); |
| 2196 break; |
| 2197 } |
2168 if (maxML == INT32_MAX) { | 2198 if (maxML == INT32_MAX) { |
2169 error(U_REGEX_LOOK_BEHIND_LIMIT); | 2199 error(U_REGEX_LOOK_BEHIND_LIMIT); |
2170 break; | 2200 break; |
2171 } | 2201 } |
2172 U_ASSERT(minML <= maxML); | 2202 U_ASSERT(minML <= maxML); |
2173 | 2203 |
2174 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat | 2204 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat |
2175 // appears at the top of the look-behind block, at location fMatchO
penParen+1 | 2205 // appears at the top of the look-behind block, at location fMatchO
penParen+1 |
2176 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); | 2206 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); |
2177 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); | 2207 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); |
2178 | 2208 |
2179 // Insert the pattern location to continue at after a successful mat
ch | 2209 // Insert the pattern location to continue at after a successful mat
ch |
2180 // as the last operand of the URX_LBN_CONT | 2210 // as the last operand of the URX_LBN_CONT |
2181 op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); | 2211 int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); |
2182 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); | 2212 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); |
2183 } | 2213 } |
2184 break; | 2214 break; |
2185 | 2215 |
2186 | 2216 |
2187 | 2217 |
2188 default: | 2218 default: |
2189 U_ASSERT(FALSE); | 2219 U_ASSERT(FALSE); |
2190 } | 2220 } |
2191 | 2221 |
(...skipping 20 matching lines...) Expand all Loading... |
2212 // There shoudn't be any, but just in case. | 2242 // There shoudn't be any, but just in case. |
2213 // (Case Closure can add them; if we had a simple case closure avaialble
that | 2243 // (Case Closure can add them; if we had a simple case closure avaialble
that |
2214 // ignored strings, that would be better.) | 2244 // ignored strings, that would be better.) |
2215 theSet->removeAllStrings(); | 2245 theSet->removeAllStrings(); |
2216 int32_t setSize = theSet->size(); | 2246 int32_t setSize = theSet->size(); |
2217 | 2247 |
2218 switch (setSize) { | 2248 switch (setSize) { |
2219 case 0: | 2249 case 0: |
2220 { | 2250 { |
2221 // Set of no elements. Always fails to match. | 2251 // Set of no elements. Always fails to match. |
2222 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStat
us); | 2252 appendOp(URX_BACKTRACK, 0); |
2223 delete theSet; | 2253 delete theSet; |
2224 } | 2254 } |
2225 break; | 2255 break; |
2226 | 2256 |
2227 case 1: | 2257 case 1: |
2228 { | 2258 { |
2229 // The set contains only a single code point. Put it into | 2259 // The set contains only a single code point. Put it into |
2230 // the compiled pattern as a single char operation rather | 2260 // the compiled pattern as a single char operation rather |
2231 // than a set, and discard the set itself. | 2261 // than a set, and discard the set itself. |
2232 literalChar(theSet->charAt(0)); | 2262 literalChar(theSet->charAt(0)); |
2233 delete theSet; | 2263 delete theSet; |
2234 } | 2264 } |
2235 break; | 2265 break; |
2236 | 2266 |
2237 default: | 2267 default: |
2238 { | 2268 { |
2239 // The set contains two or more chars. (the normal case) | 2269 // The set contains two or more chars. (the normal case) |
2240 // Put it into the compiled pattern as a set. | 2270 // Put it into the compiled pattern as a set. |
2241 int32_t setNumber = fRXPat->fSets->size(); | 2271 int32_t setNumber = fRXPat->fSets->size(); |
2242 fRXPat->fSets->addElement(theSet, *fStatus); | 2272 fRXPat->fSets->addElement(theSet, *fStatus); |
2243 int32_t setOp = URX_BUILD(URX_SETREF, setNumber); | 2273 appendOp(URX_SETREF, setNumber); |
2244 fRXPat->fCompiledPat->addElement(setOp, *fStatus); | |
2245 } | 2274 } |
2246 } | 2275 } |
2247 } | 2276 } |
2248 | 2277 |
2249 | 2278 |
2250 //------------------------------------------------------------------------------ | 2279 //------------------------------------------------------------------------------ |
2251 // | 2280 // |
2252 // compileInterval Generate the code for a {min, max} style interval quanti
fier. | 2281 // compileInterval Generate the code for a {min, max} style interval quanti
fier. |
2253 // Except for the specific opcodes used, the code is the sa
me | 2282 // Except for the specific opcodes used, the code is the sa
me |
2254 // for all three types (greedy, non-greedy, possessive) of | 2283 // for all three types (greedy, non-greedy, possessive) of |
(...skipping 18 matching lines...) Expand all Loading... |
2273 int32_t topOfBlock = blockTopLoc(TRUE); | 2302 int32_t topOfBlock = blockTopLoc(TRUE); |
2274 insertOp(topOfBlock); | 2303 insertOp(topOfBlock); |
2275 insertOp(topOfBlock); | 2304 insertOp(topOfBlock); |
2276 insertOp(topOfBlock); | 2305 insertOp(topOfBlock); |
2277 | 2306 |
2278 // The operands for the CTR_INIT opcode include the index in the matcher dat
a | 2307 // The operands for the CTR_INIT opcode include the index in the matcher dat
a |
2279 // of the counter. Allocate it now. There are two data items | 2308 // of the counter. Allocate it now. There are two data items |
2280 // counterLoc --> Loop counter | 2309 // counterLoc --> Loop counter |
2281 // +1 --> Input index (for breaking non-progressing loops) | 2310 // +1 --> Input index (for breaking non-progressing loops) |
2282 // (Only present if unbounded upper limit on loop) | 2311 // (Only present if unbounded upper limit on loop) |
2283 int32_t counterLoc = fRXPat->fFrameSize; | 2312 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1; |
2284 fRXPat->fFrameSize++; | 2313 int32_t counterLoc = allocateStackData(dataSize); |
2285 if (fIntervalUpper < 0) { | |
2286 fRXPat->fFrameSize++; | |
2287 } | |
2288 | 2314 |
2289 int32_t op = URX_BUILD(InitOp, counterLoc); | 2315 int32_t op = buildOp(InitOp, counterLoc); |
2290 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); | 2316 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); |
2291 | 2317 |
2292 // The second operand of CTR_INIT is the location following the end of the l
oop. | 2318 // The second operand of CTR_INIT is the location following the end of the l
oop. |
2293 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if
the | 2319 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if
the |
2294 // compilation of something later on causes the code to grow and the targe
t | 2320 // compilation of something later on causes the code to grow and the targe
t |
2295 // position to move. | 2321 // position to move. |
2296 int32_t loopEnd = fRXPat->fCompiledPat->size(); | 2322 int32_t loopEnd = fRXPat->fCompiledPat->size(); |
2297 op = URX_BUILD(URX_RELOC_OPRND, loopEnd); | 2323 op = buildOp(URX_RELOC_OPRND, loopEnd); |
2298 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); | 2324 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); |
2299 | 2325 |
2300 // Followed by the min and max counts. | 2326 // Followed by the min and max counts. |
2301 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); | 2327 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); |
2302 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); | 2328 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); |
2303 | 2329 |
2304 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. | 2330 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. |
2305 // Goes at end of the block being looped over, so just append to the code
so far. | 2331 // Goes at end of the block being looped over, so just append to the code
so far. |
2306 op = URX_BUILD(LoopOp, topOfBlock); | 2332 appendOp(LoopOp, topOfBlock); |
2307 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
2308 | 2333 |
2309 if ((fIntervalLow & 0xff000000) != 0 || | 2334 if ((fIntervalLow & 0xff000000) != 0 || |
2310 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { | 2335 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { |
2311 error(U_REGEX_NUMBER_TOO_BIG); | 2336 error(U_REGEX_NUMBER_TOO_BIG); |
2312 } | 2337 } |
2313 | 2338 |
2314 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { | 2339 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { |
2315 error(U_REGEX_MAX_LT_MIN); | 2340 error(U_REGEX_MAX_LT_MIN); |
2316 } | 2341 } |
2317 } | 2342 } |
2318 | 2343 |
2319 | 2344 |
2320 | 2345 |
2321 UBool RegexCompile::compileInlineInterval() { | 2346 UBool RegexCompile::compileInlineInterval() { |
2322 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { | 2347 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { |
2323 // Too big to inline. Fail, which will cause looping code to be generat
ed. | 2348 // Too big to inline. Fail, which will cause looping code to be generat
ed. |
2324 // (Upper < Lower picks up unbounded upper and errors, both.) | 2349 // (Upper < Lower picks up unbounded upper and errors, both.) |
2325 return FALSE; | 2350 return FALSE; |
2326 } | 2351 } |
2327 | 2352 |
2328 int32_t topOfBlock = blockTopLoc(FALSE); | 2353 int32_t topOfBlock = blockTopLoc(FALSE); |
2329 if (fIntervalUpper == 0) { | 2354 if (fIntervalUpper == 0) { |
2330 // Pathological case. Attempt no matches, as if the block doesn't exist
. | 2355 // Pathological case. Attempt no matches, as if the block doesn't exist
. |
| 2356 // Discard the generated code for the block. |
| 2357 // If the block included parens, discard the info pertaining to them as
well. |
2331 fRXPat->fCompiledPat->setSize(topOfBlock); | 2358 fRXPat->fCompiledPat->setSize(topOfBlock); |
| 2359 if (fMatchOpenParen >= topOfBlock) { |
| 2360 fMatchOpenParen = -1; |
| 2361 } |
| 2362 if (fMatchCloseParen >= topOfBlock) { |
| 2363 fMatchCloseParen = -1; |
| 2364 } |
2332 return TRUE; | 2365 return TRUE; |
2333 } | 2366 } |
2334 | 2367 |
2335 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { | 2368 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { |
2336 // The thing being repeated is not a single op, but some | 2369 // The thing being repeated is not a single op, but some |
2337 // more complex block. Do it as a loop, not inlines. | 2370 // more complex block. Do it as a loop, not inlines. |
2338 // Note that things "repeated" a max of once are handled as inline, be
cause | 2371 // Note that things "repeated" a max of once are handled as inline, be
cause |
2339 // the one copy of the code already generated is just fine. | 2372 // the one copy of the code already generated is just fine. |
2340 return FALSE; | 2373 return FALSE; |
2341 } | 2374 } |
2342 | 2375 |
2343 // Pick up the opcode that is to be repeated | 2376 // Pick up the opcode that is to be repeated |
2344 // | 2377 // |
2345 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); | 2378 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); |
2346 | 2379 |
2347 // Compute the pattern location where the inline sequence | 2380 // Compute the pattern location where the inline sequence |
2348 // will end, and set up the state save op that will be needed. | 2381 // will end, and set up the state save op that will be needed. |
2349 // | 2382 // |
2350 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 | 2383 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 |
2351 + fIntervalUpper + (fIntervalUpper-fIntervalLow)
; | 2384 + fIntervalUpper + (fIntervalUpper-fIntervalLow)
; |
2352 int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); | 2385 int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc); |
2353 if (fIntervalLow == 0) { | 2386 if (fIntervalLow == 0) { |
2354 insertOp(topOfBlock); | 2387 insertOp(topOfBlock); |
2355 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); | 2388 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); |
2356 } | 2389 } |
2357 | 2390 |
2358 | 2391 |
2359 | 2392 |
2360 // Loop, emitting the op for the thing being repeated each time. | 2393 // Loop, emitting the op for the thing being repeated each time. |
2361 // Loop starts at 1 because one instance of the op already exists in the
pattern, | 2394 // Loop starts at 1 because one instance of the op already exists in the
pattern, |
2362 // it was put there when it was originally encountered. | 2395 // it was put there when it was originally encountered. |
2363 int32_t i; | 2396 int32_t i; |
2364 for (i=1; i<fIntervalUpper; i++ ) { | 2397 for (i=1; i<fIntervalUpper; i++ ) { |
2365 if (i == fIntervalLow) { | 2398 if (i >= fIntervalLow) { |
2366 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); | 2399 appendOp(saveOp); |
2367 } | 2400 } |
2368 if (i > fIntervalLow) { | 2401 appendOp(op); |
2369 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); | |
2370 } | |
2371 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
2372 } | 2402 } |
2373 return TRUE; | 2403 return TRUE; |
2374 } | 2404 } |
2375 | 2405 |
2376 | 2406 |
2377 | 2407 |
2378 //------------------------------------------------------------------------------ | 2408 //------------------------------------------------------------------------------ |
2379 // | 2409 // |
2380 // caseInsensitiveStart given a single code point from a pattern string, dete
rmine the | 2410 // caseInsensitiveStart given a single code point from a pattern string, dete
rmine the |
2381 // set of characters that could potentially begin a case
-insensitive | 2411 // set of characters that could potentially begin a case
-insensitive |
(...skipping 1198 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3580 case URX_CTR_LOOP_NG: | 3610 case URX_CTR_LOOP_NG: |
3581 case URX_RELOC_OPRND: | 3611 case URX_RELOC_OPRND: |
3582 case URX_JMPX: | 3612 case URX_JMPX: |
3583 case URX_JMP_SAV: | 3613 case URX_JMP_SAV: |
3584 case URX_JMP_SAV_X: | 3614 case URX_JMP_SAV_X: |
3585 // These are instructions with operands that refer to code locations
. | 3615 // These are instructions with operands that refer to code locations
. |
3586 { | 3616 { |
3587 int32_t operandAddress = URX_VAL(op); | 3617 int32_t operandAddress = URX_VAL(op); |
3588 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); | 3618 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); |
3589 int32_t fixedOperandAddress = operandAddress - deltas.elementAti
(operandAddress); | 3619 int32_t fixedOperandAddress = operandAddress - deltas.elementAti
(operandAddress); |
3590 op = URX_BUILD(opType, fixedOperandAddress); | 3620 op = buildOp(opType, fixedOperandAddress); |
3591 fRXPat->fCompiledPat->setElementAt(op, dst); | 3621 fRXPat->fCompiledPat->setElementAt(op, dst); |
3592 dst++; | 3622 dst++; |
3593 break; | 3623 break; |
3594 } | 3624 } |
3595 | 3625 |
3596 case URX_BACKREF: | 3626 case URX_BACKREF: |
3597 case URX_BACKREF_I: | 3627 case URX_BACKREF_I: |
3598 { | 3628 { |
3599 int32_t where = URX_VAL(op); | 3629 int32_t where = URX_VAL(op); |
3600 if (where > fRXPat->fGroupMap->size()) { | 3630 if (where > fRXPat->fGroupMap->size()) { |
3601 error(U_REGEX_INVALID_BACK_REF); | 3631 error(U_REGEX_INVALID_BACK_REF); |
3602 break; | 3632 break; |
3603 } | 3633 } |
3604 where = fRXPat->fGroupMap->elementAti(where-1); | 3634 where = fRXPat->fGroupMap->elementAti(where-1); |
3605 op = URX_BUILD(opType, where); | 3635 op = buildOp(opType, where); |
3606 fRXPat->fCompiledPat->setElementAt(op, dst); | 3636 fRXPat->fCompiledPat->setElementAt(op, dst); |
3607 dst++; | 3637 dst++; |
3608 | 3638 |
3609 fRXPat->fNeedsAltInput = TRUE; | 3639 fRXPat->fNeedsAltInput = TRUE; |
3610 break; | 3640 break; |
3611 } | 3641 } |
3612 case URX_RESERVED_OP: | 3642 case URX_RESERVED_OP: |
3613 case URX_RESERVED_OP_N: | 3643 case URX_RESERVED_OP_N: |
3614 case URX_BACKTRACK: | 3644 case URX_BACKTRACK: |
3615 case URX_END: | 3645 case URX_END: |
(...skipping 331 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3947 fEOLComments = TRUE; | 3977 fEOLComments = TRUE; |
3948 | 3978 |
3949 // putc(c.fChar, stdout); | 3979 // putc(c.fChar, stdout); |
3950 } | 3980 } |
3951 | 3981 |
3952 | 3982 |
3953 | 3983 |
3954 //------------------------------------------------------------------------------ | 3984 //------------------------------------------------------------------------------ |
3955 // | 3985 // |
3956 // scanNamedChar | 3986 // scanNamedChar |
3957 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. | 3987 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. |
3958 // | 3988 // |
3959 // The scan position will be at the 'N'. On return | 3989 // The scan position will be at the 'N'. On return |
3960 // the scan position should be just after the '}' | 3990 // the scan position should be just after the '}' |
3961 // | 3991 // |
3962 // Return the UChar32 | 3992 // Return the UChar32 |
3963 // | 3993 // |
3964 //------------------------------------------------------------------------------ | 3994 //------------------------------------------------------------------------------ |
3965 UChar32 RegexCompile::scanNamedChar() { | 3995 UChar32 RegexCompile::scanNamedChar() { |
3966 if (U_FAILURE(*fStatus)) { | 3996 if (U_FAILURE(*fStatus)) { |
3967 return 0; | 3997 return 0; |
(...skipping 451 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4419 | 4449 |
4420 void RegexCompile::setPushOp(int32_t op) { | 4450 void RegexCompile::setPushOp(int32_t op) { |
4421 setEval(op); | 4451 setEval(op); |
4422 fSetOpStack.push(op, *fStatus); | 4452 fSetOpStack.push(op, *fStatus); |
4423 fSetStack.push(new UnicodeSet(), *fStatus); | 4453 fSetStack.push(new UnicodeSet(), *fStatus); |
4424 } | 4454 } |
4425 | 4455 |
4426 U_NAMESPACE_END | 4456 U_NAMESPACE_END |
4427 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | 4457 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
4428 | 4458 |
OLD | NEW |