OLD | NEW |
1 // | 1 // |
2 // file: regexcmp.cpp | 2 // file: regexcmp.cpp |
3 // | 3 // |
4 // Copyright (C) 2002-2013 International Business Machines Corporation and othe
rs. | 4 // Copyright (C) 2002-2014 International Business Machines Corporation and othe
rs. |
5 // All Rights Reserved. | 5 // All Rights Reserved. |
6 // | 6 // |
7 // This file contains the ICU regular expression compiler, which is responsible | 7 // This file contains the ICU regular expression compiler, which is responsible |
8 // for processing a regular expression pattern into the compiled form that | 8 // for processing a regular expression pattern into the compiled form that |
9 // is used by the match finding engine. | 9 // is used by the match finding engine. |
10 // | 10 // |
11 | 11 |
12 #include "unicode/utypes.h" | 12 #include "unicode/utypes.h" |
13 | 13 |
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
15 | 15 |
16 #include "unicode/ustring.h" | 16 #include "unicode/ustring.h" |
17 #include "unicode/unistr.h" | 17 #include "unicode/unistr.h" |
18 #include "unicode/uniset.h" | 18 #include "unicode/uniset.h" |
19 #include "unicode/uchar.h" | 19 #include "unicode/uchar.h" |
20 #include "unicode/uchriter.h" | 20 #include "unicode/uchriter.h" |
21 #include "unicode/parsepos.h" | 21 #include "unicode/parsepos.h" |
22 #include "unicode/parseerr.h" | 22 #include "unicode/parseerr.h" |
23 #include "unicode/regex.h" | 23 #include "unicode/regex.h" |
24 #include "unicode/utf.h" | 24 #include "unicode/utf.h" |
25 #include "unicode/utf16.h" | 25 #include "unicode/utf16.h" |
26 #include "patternprops.h" | 26 #include "patternprops.h" |
27 #include "putilimp.h" | 27 #include "putilimp.h" |
28 #include "cmemory.h" | 28 #include "cmemory.h" |
29 #include "cstring.h" | 29 #include "cstring.h" |
30 #include "uvectr32.h" | 30 #include "uvectr32.h" |
31 #include "uvectr64.h" | 31 #include "uvectr64.h" |
32 #include "uassert.h" | 32 #include "uassert.h" |
33 #include "ucln_in.h" | |
34 #include "uinvchar.h" | 33 #include "uinvchar.h" |
35 | 34 |
36 #include "regeximp.h" | 35 #include "regeximp.h" |
37 #include "regexcst.h" // Contains state table for the regex pattern parser. | 36 #include "regexcst.h" // Contains state table for the regex pattern parser. |
38 // generated by a Perl script. | 37 // generated by a Perl script. |
39 #include "regexcmp.h" | 38 #include "regexcmp.h" |
40 #include "regexst.h" | 39 #include "regexst.h" |
41 #include "regextxt.h" | 40 #include "regextxt.h" |
42 | 41 |
43 | 42 |
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
102 // | 101 // |
103 //------------------------------------------------------------------------------ | 102 //------------------------------------------------------------------------------ |
104 void RegexCompile::compile( | 103 void RegexCompile::compile( |
105 const UnicodeString &pat, // Source pat to be compile
d. | 104 const UnicodeString &pat, // Source pat to be compile
d. |
106 UParseError &pp, // Error position info | 105 UParseError &pp, // Error position info |
107 UErrorCode &e) // Error Code | 106 UErrorCode &e) // Error Code |
108 { | 107 { |
109 fRXPat->fPatternString = new UnicodeString(pat); | 108 fRXPat->fPatternString = new UnicodeString(pat); |
110 UText patternText = UTEXT_INITIALIZER; | 109 UText patternText = UTEXT_INITIALIZER; |
111 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); | 110 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); |
112 | 111 |
113 if (U_SUCCESS(e)) { | 112 if (U_SUCCESS(e)) { |
114 compile(&patternText, pp, e); | 113 compile(&patternText, pp, e); |
115 utext_close(&patternText); | 114 utext_close(&patternText); |
116 } | 115 } |
117 } | 116 } |
118 | 117 |
119 // | 118 // |
120 // compile, UText mode | 119 // compile, UText mode |
121 // All the work is actually done here. | 120 // All the work is actually done here. |
122 // | 121 // |
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
295 n *= 10; | 294 n *= 10; |
296 } | 295 } |
297 | 296 |
298 // | 297 // |
299 // The pattern's fFrameSize so far has accumulated the requirements for | 298 // The pattern's fFrameSize so far has accumulated the requirements for |
300 // storage for capture parentheses, counters, etc. that are encountered | 299 // storage for capture parentheses, counters, etc. that are encountered |
301 // in the pattern. Add space for the two variables that are always | 300 // in the pattern. Add space for the two variables that are always |
302 // present in the saved state: the input string position (int64_t) and | 301 // present in the saved state: the input string position (int64_t) and |
303 // the position in the compiled pattern. | 302 // the position in the compiled pattern. |
304 // | 303 // |
305 allocateStackData(RESTACKFRAME_HDRCOUNT); | 304 fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; |
306 | 305 |
307 // | 306 // |
308 // Optimization pass 1: NOPs, back-references, and case-folding | 307 // Optimization pass 1: NOPs, back-references, and case-folding |
309 // | 308 // |
310 stripNOPs(); | 309 stripNOPs(); |
311 | 310 |
312 // | 311 // |
313 // Get bounds for the minimum and maximum length of a string that this | 312 // Get bounds for the minimum and maximum length of a string that this |
314 // pattern can match. Used to avoid looking for matches in strings that | 313 // pattern can match. Used to avoid looking for matches in strings that |
315 // are too short. | 314 // are too short. |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
361 | 360 |
362 case doPatStart: | 361 case doPatStart: |
363 // Start of pattern compiles to: | 362 // Start of pattern compiles to: |
364 //0 SAVE 2 Fall back to position of FAIL | 363 //0 SAVE 2 Fall back to position of FAIL |
365 //1 jmp 3 | 364 //1 jmp 3 |
366 //2 FAIL Stop if we ever reach here. | 365 //2 FAIL Stop if we ever reach here. |
367 //3 NOP Dummy, so start of pattern looks the same as | 366 //3 NOP Dummy, so start of pattern looks the same as |
368 // the start of an ( grouping. | 367 // the start of an ( grouping. |
369 //4 NOP Reserved, will be replaced by a save if there are | 368 //4 NOP Reserved, will be replaced by a save if there are |
370 // OR | operators at the top level | 369 // OR | operators at the top level |
371 appendOp(URX_BUILD(URX_STATE_SAVE, 2)); | 370 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus)
; |
372 appendOp(URX_BUILD(URX_JMP, 3)); | 371 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); |
373 appendOp(URX_BUILD(URX_FAIL, 0)); | 372 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); |
374 | 373 |
375 // Standard open nonCapture paren action emits the two NOPs and | 374 // Standard open nonCapture paren action emits the two NOPs and |
376 // sets up the paren stack frame. | 375 // sets up the paren stack frame. |
377 doParseActions(doOpenNonCaptureParen); | 376 doParseActions(doOpenNonCaptureParen); |
378 break; | 377 break; |
379 | 378 |
380 case doPatFinish: | 379 case doPatFinish: |
381 // We've scanned to the end of the pattern | 380 // We've scanned to the end of the pattern |
382 // The end of pattern compiles to: | 381 // The end of pattern compiles to: |
383 // URX_END | 382 // URX_END |
384 // which will stop the runtime match engine. | 383 // which will stop the runtime match engine. |
385 // Encountering end of pattern also behaves like a close paren, | 384 // Encountering end of pattern also behaves like a close paren, |
386 // and forces fixups of the State Save at the beginning of the compile
d pattern | 385 // and forces fixups of the State Save at the beginning of the compile
d pattern |
387 // and of any OR operations at the top level. | 386 // and of any OR operations at the top level. |
388 // | 387 // |
389 handleCloseParen(); | 388 handleCloseParen(); |
390 if (fParenStack.size() > 0) { | 389 if (fParenStack.size() > 0) { |
391 // Missing close paren in pattern. | 390 // Missing close paren in pattern. |
392 error(U_REGEX_MISMATCHED_PAREN); | 391 error(U_REGEX_MISMATCHED_PAREN); |
393 } | 392 } |
394 | 393 |
395 // add the END operation to the compiled pattern. | 394 // add the END operation to the compiled pattern. |
396 appendOp(URX_BUILD(URX_END, 0)); | 395 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); |
397 | 396 |
398 // Terminate the pattern compilation state machine. | 397 // Terminate the pattern compilation state machine. |
399 returnVal = FALSE; | 398 returnVal = FALSE; |
400 break; | 399 break; |
401 | 400 |
402 | 401 |
403 | 402 |
404 case doOrOperator: | 403 case doOrOperator: |
405 // Scanning a '|', as in (A|B) | 404 // Scanning a '|', as in (A|B) |
406 { | 405 { |
407 // Generate code for any pending literals preceding the '|' | 406 // Generate code for any pending literals preceding the '|' |
408 fixLiterals(FALSE); | 407 fixLiterals(FALSE); |
409 | 408 |
410 // Insert a SAVE operation at the start of the pattern section prece
ding | 409 // Insert a SAVE operation at the start of the pattern section prece
ding |
411 // this OR at this level. This SAVE will branch the match forward | 410 // this OR at this level. This SAVE will branch the match forward |
412 // to the right hand side of the OR in the event that the left han
d | 411 // to the right hand side of the OR in the event that the left han
d |
413 // side fails to match and backtracks. Locate the position for th
e | 412 // side fails to match and backtracks. Locate the position for th
e |
414 // save from the location on the top of the parentheses stack. | 413 // save from the location on the top of the parentheses stack. |
415 int32_t savePosition = fParenStack.popi(); | 414 int32_t savePosition = fParenStack.popi(); |
416 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition)
; | 415 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition)
; |
417 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved
location | 416 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved
location |
418 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); | 417 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); |
419 fRXPat->fCompiledPat->setElementAt(op, savePosition); | 418 fRXPat->fCompiledPat->setElementAt(op, savePosition); |
420 | 419 |
421 // Append an JMP operation into the compiled pattern. The operand f
or | 420 // Append an JMP operation into the compiled pattern. The operand f
or |
422 // the JMP will eventually be the location following the ')' for th
e | 421 // the JMP will eventually be the location following the ')' for th
e |
423 // group. This will be patched in later, when the ')' is encounter
ed. | 422 // group. This will be patched in later, when the ')' is encounter
ed. |
424 op = URX_BUILD(URX_JMP, 0); | 423 op = URX_BUILD(URX_JMP, 0); |
425 appendOp(op); | 424 fRXPat->fCompiledPat->addElement(op, *fStatus); |
426 | 425 |
427 // Push the position of the newly added JMP op onto the parentheses
stack. | 426 // Push the position of the newly added JMP op onto the parentheses
stack. |
428 // This registers it for fixup when this block's close paren is enco
untered. | 427 // This registers it for fixup when this block's close paren is enco
untered. |
429 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | 428 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
430 | 429 |
431 // Append a NOP to the compiled pattern. This is the slot reserved | 430 // Append a NOP to the compiled pattern. This is the slot reserved |
432 // for a SAVE in the event that there is yet another '|' following | 431 // for a SAVE in the event that there is yet another '|' following |
433 // this one. | 432 // this one. |
434 appendOp(URX_BUILD(URX_NOP, 0)); | 433 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
435 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | 434 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
436 } | 435 } |
437 break; | 436 break; |
438 | 437 |
439 | 438 |
440 case doOpenCaptureParen: | 439 case doOpenCaptureParen: |
441 // Open Paren. | 440 // Open Paren. |
442 // Compile to a | 441 // Compile to a |
443 // - NOP, which later may be replaced by a save-state if the | 442 // - NOP, which later may be replaced by a save-state if the |
444 // parenthesized group gets a * quantifier, followed by | 443 // parenthesized group gets a * quantifier, followed by |
445 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. | 444 // - START_CAPTURE n where n is stack frame offset to the captu
re group variables. |
446 // - NOP, which may later be replaced by a save-state if there | 445 // - NOP, which may later be replaced by a save-state if there |
447 // is an '|' alternation within the parens. | 446 // is an '|' alternation within the parens. |
448 // | 447 // |
449 // Each capture group gets three slots in the save stack frame: | 448 // Each capture group gets three slots in the save stack frame: |
450 // 0: Capture Group start position (in input string being matche
d.) | 449 // 0: Capture Group start position (in input string being matche
d.) |
451 // 1: Capture Group end position. | 450 // 1: Capture Group end position. |
452 // 2: Start of Match-in-progress. | 451 // 2: Start of Match-in-progress. |
453 // The first two locations are for a completed capture group, and are | 452 // The first two locations are for a completed capture group, and are |
454 // referred to by back references and the like. | 453 // referred to by back references and the like. |
455 // The third location stores the capture start position when an START
_CAPTURE is | 454 // The third location stores the capture start position when an START
_CAPTURE is |
456 // encountered. This will be promoted to a completed capture when
(and if) the corresponding | 455 // encountered. This will be promoted to a completed capture when
(and if) the corresponding |
457 // END_CAPTURE is encountered. | 456 // END_CAPTURE is encountered. |
458 { | 457 { |
459 fixLiterals(); | 458 fixLiterals(); |
460 appendOp(URX_BUILD(URX_NOP, 0)); | 459 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
461 int32_t varsLoc = allocateStackData(3); // Reserve three slots i
n match stack frame. | 460 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots
in match stack frame. |
462 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); | 461 fRXPat->fFrameSize += 3; |
463 appendOp(cop); | 462 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); |
464 appendOp(URX_BUILD(URX_NOP, 0)); | 463 fRXPat->fCompiledPat->addElement(cop, *fStatus); |
| 464 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
465 | 465 |
466 // On the Parentheses stack, start a new frame and add the positions | 466 // On the Parentheses stack, start a new frame and add the positions |
467 // of the two NOPs. Depending on what follows in the pattern, the | 467 // of the two NOPs. Depending on what follows in the pattern, the |
468 // NOPs may be changed to SAVE_STATE or JMP ops, with a target | 468 // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
469 // address of the end of the parenthesized group. | 469 // address of the end of the parenthesized group. |
470 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 470 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
471 fParenStack.push(capturing, *fStatus); // Fra
me type. | 471 fParenStack.push(capturing, *fStatus); // Fra
me type. |
472 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location | 472 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP location |
473 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc | 473 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc |
474 | 474 |
475 // Save the mapping from group number to stack frame variable positi
on. | 475 // Save the mapping from group number to stack frame variable positi
on. |
476 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); | 476 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); |
477 } | 477 } |
478 break; | 478 break; |
479 | 479 |
480 case doOpenNonCaptureParen: | 480 case doOpenNonCaptureParen: |
481 // Open non-capturing (grouping only) Paren. | 481 // Open non-capturing (grouping only) Paren. |
482 // Compile to a | 482 // Compile to a |
483 // - NOP, which later may be replaced by a save-state if the | 483 // - NOP, which later may be replaced by a save-state if the |
484 // parenthesized group gets a * quantifier, followed by | 484 // parenthesized group gets a * quantifier, followed by |
485 // - NOP, which may later be replaced by a save-state if there | 485 // - NOP, which may later be replaced by a save-state if there |
486 // is an '|' alternation within the parens. | 486 // is an '|' alternation within the parens. |
487 { | 487 { |
488 fixLiterals(); | 488 fixLiterals(); |
489 appendOp(URX_BUILD(URX_NOP, 0)); | 489 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
490 appendOp(URX_BUILD(URX_NOP, 0)); | 490 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
491 | 491 |
492 // On the Parentheses stack, start a new frame and add the positions | 492 // On the Parentheses stack, start a new frame and add the positions |
493 // of the two NOPs. | 493 // of the two NOPs. |
494 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 494 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
495 fParenStack.push(plain, *fStatus); // Beg
in a new frame. | 495 fParenStack.push(plain, *fStatus); // Beg
in a new frame. |
496 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 496 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
497 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc | 497 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP loc |
498 } | 498 } |
499 break; | 499 break; |
500 | 500 |
501 | 501 |
502 case doOpenAtomicParen: | 502 case doOpenAtomicParen: |
503 // Open Atomic Paren. (?> | 503 // Open Atomic Paren. (?> |
504 // Compile to a | 504 // Compile to a |
505 // - NOP, which later may be replaced if the parenthesized group | 505 // - NOP, which later may be replaced if the parenthesized group |
506 // has a quantifier, followed by | 506 // has a quantifier, followed by |
507 // - STO_SP save state stack position, so it can be restored at th
e ")" | 507 // - STO_SP save state stack position, so it can be restored at th
e ")" |
508 // - NOP, which may later be replaced by a save-state if there | 508 // - NOP, which may later be replaced by a save-state if there |
509 // is an '|' alternation within the parens. | 509 // is an '|' alternation within the parens. |
510 { | 510 { |
511 fixLiterals(); | 511 fixLiterals(); |
512 appendOp(URX_BUILD(URX_NOP, 0)); | 512 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
513 int32_t varLoc = allocateData(1); // Reserve a data location for
saving the state stack ptr. | 513 int32_t varLoc = fRXPat->fDataSize; // Reserve a data locatio
n for saving the |
514 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); | 514 fRXPat->fDataSize += 1; // state stack ptr. |
515 appendOp(stoOp); | 515 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); |
516 appendOp(URX_BUILD(URX_NOP, 0)); | 516 fRXPat->fCompiledPat->addElement(stoOp, *fStatus); |
| 517 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
517 | 518 |
518 // On the Parentheses stack, start a new frame and add the positions | 519 // On the Parentheses stack, start a new frame and add the positions |
519 // of the two NOPs. Depending on what follows in the pattern, the | 520 // of the two NOPs. Depending on what follows in the pattern, the |
520 // NOPs may be changed to SAVE_STATE or JMP ops, with a target | 521 // NOPs may be changed to SAVE_STATE or JMP ops, with a target |
521 // address of the end of the parenthesized group. | 522 // address of the end of the parenthesized group. |
522 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 523 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
523 fParenStack.push(atomic, *fStatus); // Fra
me type. | 524 fParenStack.push(atomic, *fStatus); // Fra
me type. |
524 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP | 525 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The
first NOP |
525 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP | 526 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP |
526 } | 527 } |
(...skipping 22 matching lines...) Expand all Loading... |
549 // 6. NOP reserved for use by quantifiers on the block
. | 550 // 6. NOP reserved for use by quantifiers on the block
. |
550 // Look-ahead can't have quantifiers, but paren
stack | 551 // Look-ahead can't have quantifiers, but paren
stack |
551 // compile time conventions require the slot
anyhow. | 552 // compile time conventions require the slot
anyhow. |
552 // 7. NOP may be replaced if there are '|' ops in t
he block. | 553 // 7. NOP may be replaced if there are '|' ops in t
he block. |
553 // 8. code for parenthesized stuff. | 554 // 8. code for parenthesized stuff. |
554 // 9. LA_END | 555 // 9. LA_END |
555 // | 556 // |
556 // Two data slots are reserved, for saving the stack ptr and the input
position. | 557 // Two data slots are reserved, for saving the stack ptr and the input
position. |
557 { | 558 { |
558 fixLiterals(); | 559 fixLiterals(); |
559 int32_t dataLoc = allocateData(2); | 560 int32_t dataLoc = fRXPat->fDataSize; |
| 561 fRXPat->fDataSize += 2; |
560 int32_t op = URX_BUILD(URX_LA_START, dataLoc); | 562 int32_t op = URX_BUILD(URX_LA_START, dataLoc); |
561 appendOp(op); | 563 fRXPat->fCompiledPat->addElement(op, *fStatus); |
562 | 564 |
563 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); | 565 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); |
564 appendOp(op); | 566 fRXPat->fCompiledPat->addElement(op, *fStatus); |
565 | 567 |
566 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); | 568 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); |
567 appendOp(op); | 569 fRXPat->fCompiledPat->addElement(op, *fStatus); |
568 | 570 |
569 op = URX_BUILD(URX_LA_END, dataLoc); | 571 op = URX_BUILD(URX_LA_END, dataLoc); |
570 appendOp(op); | 572 fRXPat->fCompiledPat->addElement(op, *fStatus); |
571 | 573 |
572 op = URX_BUILD(URX_BACKTRACK, 0); | 574 op = URX_BUILD(URX_BACKTRACK, 0); |
573 appendOp(op); | 575 fRXPat->fCompiledPat->addElement(op, *fStatus); |
574 | 576 |
575 op = URX_BUILD(URX_NOP, 0); | 577 op = URX_BUILD(URX_NOP, 0); |
576 appendOp(op); | 578 fRXPat->fCompiledPat->addElement(op, *fStatus); |
577 appendOp(op); | 579 fRXPat->fCompiledPat->addElement(op, *fStatus); |
578 | 580 |
579 // On the Parentheses stack, start a new frame and add the positions | 581 // On the Parentheses stack, start a new frame and add the positions |
580 // of the NOPs. | 582 // of the NOPs. |
581 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 583 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
582 fParenStack.push(lookAhead, *fStatus); // Fra
me type. | 584 fParenStack.push(lookAhead, *fStatus); // Fra
me type. |
583 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 585 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
584 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location | 586 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location |
585 } | 587 } |
586 break; | 588 break; |
587 | 589 |
588 case doOpenLookAheadNeg: | 590 case doOpenLookAheadNeg: |
589 // Negated Lookahead. (?! stuff ) | 591 // Negated Lookahead. (?! stuff ) |
590 // Compiles to | 592 // Compiles to |
591 // 1. START_LA dataloc | 593 // 1. START_LA dataloc |
592 // 2. SAVE_STATE 7 // Fail within look-ahead block restor
es to this state, | 594 // 2. SAVE_STATE 7 // Fail within look-ahead block restor
es to this state, |
593 // // which continues with the match. | 595 // // which continues with the match. |
594 // 3. NOP // Std. Open Paren sequence, for possi
ble '|' | 596 // 3. NOP // Std. Open Paren sequence, for possi
ble '|' |
595 // 4. code for parenthesized stuff. | 597 // 4. code for parenthesized stuff. |
596 // 5. END_LA // Cut back stack, remove saved state
from step 2. | 598 // 5. END_LA // Cut back stack, remove saved state
from step 2. |
597 // 6. BACKTRACK // code in block succeeded, so neg. lo
okahead fails. | 599 // 6. BACKTRACK // code in block succeeded, so neg. lo
okahead fails. |
598 // 7. END_LA // Restore match region, in case look-
ahead was using | 600 // 7. END_LA // Restore match region, in case look-
ahead was using |
599 // an alternate (transparent) reg
ion. | 601 // an alternate (transparent) reg
ion. |
600 { | 602 { |
601 fixLiterals(); | 603 fixLiterals(); |
602 int32_t dataLoc = allocateData(2); | 604 int32_t dataLoc = fRXPat->fDataSize; |
| 605 fRXPat->fDataSize += 2; |
603 int32_t op = URX_BUILD(URX_LA_START, dataLoc); | 606 int32_t op = URX_BUILD(URX_LA_START, dataLoc); |
604 appendOp(op); | 607 fRXPat->fCompiledPat->addElement(op, *fStatus); |
605 | 608 |
606 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patche
d later. | 609 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patche
d later. |
607 appendOp(op); | 610 fRXPat->fCompiledPat->addElement(op, *fStatus); |
608 | 611 |
609 op = URX_BUILD(URX_NOP, 0); | 612 op = URX_BUILD(URX_NOP, 0); |
610 appendOp(op); | 613 fRXPat->fCompiledPat->addElement(op, *fStatus); |
611 | 614 |
612 // On the Parentheses stack, start a new frame and add the positions | 615 // On the Parentheses stack, start a new frame and add the positions |
613 // of the StateSave and NOP. | 616 // of the StateSave and NOP. |
614 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 617 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
615 fParenStack.push(negLookAhead, *fStatus); // Fram
e type | 618 fParenStack.push(negLookAhead, *fStatus); // Fram
e type |
616 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
STATE_SAVE location | 619 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
STATE_SAVE location |
617 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location | 620 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP location |
618 | 621 |
619 // Instructions #5 - #7 will be added when the ')' is encountered. | 622 // Instructions #5 - #7 will be added when the ')' is encountered. |
620 } | 623 } |
(...skipping 17 matching lines...) Expand all Loading... |
638 // Allocate a block of matcher data, to contain (when runni
ng a match) | 641 // Allocate a block of matcher data, to contain (when runni
ng a match) |
639 // 0: Stack ptr on entry | 642 // 0: Stack ptr on entry |
640 // 1: Input Index on entry | 643 // 1: Input Index on entry |
641 // 2: Start index of match current match attempt. | 644 // 2: Start index of match current match attempt. |
642 // 3: Original Input String len. | 645 // 3: Original Input String len. |
643 | 646 |
644 // Generate match code for any pending literals. | 647 // Generate match code for any pending literals. |
645 fixLiterals(); | 648 fixLiterals(); |
646 | 649 |
647 // Allocate data space | 650 // Allocate data space |
648 int32_t dataLoc = allocateData(4); | 651 int32_t dataLoc = fRXPat->fDataSize; |
| 652 fRXPat->fDataSize += 4; |
649 | 653 |
650 // Emit URX_LB_START | 654 // Emit URX_LB_START |
651 int32_t op = URX_BUILD(URX_LB_START, dataLoc); | 655 int32_t op = URX_BUILD(URX_LB_START, dataLoc); |
652 appendOp(op); | 656 fRXPat->fCompiledPat->addElement(op, *fStatus); |
653 | 657 |
654 // Emit URX_LB_CONT | 658 // Emit URX_LB_CONT |
655 op = URX_BUILD(URX_LB_CONT, dataLoc); | 659 op = URX_BUILD(URX_LB_CONT, dataLoc); |
656 appendOp(op); | 660 fRXPat->fCompiledPat->addElement(op, *fStatus); |
657 appendOp(0); // MinMatchLength. To be filled later. | 661 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength
. To be filled later. |
658 appendOp(0); // MaxMatchLength. To be filled later. | 662 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength
. To be filled later. |
659 | 663 |
660 // Emit the NOP | 664 // Emit the NOP |
661 op = URX_BUILD(URX_NOP, 0); | 665 op = URX_BUILD(URX_NOP, 0); |
662 appendOp(op); | 666 fRXPat->fCompiledPat->addElement(op, *fStatus); |
663 appendOp(op); | 667 fRXPat->fCompiledPat->addElement(op, *fStatus); |
664 | 668 |
665 // On the Parentheses stack, start a new frame and add the positions | 669 // On the Parentheses stack, start a new frame and add the positions |
666 // of the URX_LB_CONT and the NOP. | 670 // of the URX_LB_CONT and the NOP. |
667 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 671 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
668 fParenStack.push(lookBehind, *fStatus); // Fra
me type | 672 fParenStack.push(lookBehind, *fStatus); // Fra
me type |
669 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 673 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
670 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location | 674 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location |
671 | 675 |
672 // The final two instructions will be added when the ')' is encounte
red. | 676 // The final two instructions will be added when the ')' is encounte
red. |
673 } | 677 } |
(...skipping 19 matching lines...) Expand all Loading... |
693 // Allocate a block of matcher data, to contain (when runni
ng a match) | 697 // Allocate a block of matcher data, to contain (when runni
ng a match) |
694 // 0: Stack ptr on entry | 698 // 0: Stack ptr on entry |
695 // 1: Input Index on entry | 699 // 1: Input Index on entry |
696 // 2: Start index of match current match attempt. | 700 // 2: Start index of match current match attempt. |
697 // 3: Original Input String len. | 701 // 3: Original Input String len. |
698 | 702 |
699 // Generate match code for any pending literals. | 703 // Generate match code for any pending literals. |
700 fixLiterals(); | 704 fixLiterals(); |
701 | 705 |
702 // Allocate data space | 706 // Allocate data space |
703 int32_t dataLoc = allocateData(4); | 707 int32_t dataLoc = fRXPat->fDataSize; |
| 708 fRXPat->fDataSize += 4; |
704 | 709 |
705 // Emit URX_LB_START | 710 // Emit URX_LB_START |
706 int32_t op = URX_BUILD(URX_LB_START, dataLoc); | 711 int32_t op = URX_BUILD(URX_LB_START, dataLoc); |
707 appendOp(op); | 712 fRXPat->fCompiledPat->addElement(op, *fStatus); |
708 | 713 |
709 // Emit URX_LBN_CONT | 714 // Emit URX_LBN_CONT |
710 op = URX_BUILD(URX_LBN_CONT, dataLoc); | 715 op = URX_BUILD(URX_LBN_CONT, dataLoc); |
711 appendOp(op); | 716 fRXPat->fCompiledPat->addElement(op, *fStatus); |
712 appendOp(0); // MinMatchLength. To be filled later. | 717 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength
. To be filled later. |
713 appendOp(0); // MaxMatchLength. To be filled later. | 718 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength
. To be filled later. |
714 appendOp(0); // Continue Loc. To be filled later. | 719 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc.
To be filled later. |
715 | 720 |
716 // Emit the NOP | 721 // Emit the NOP |
717 op = URX_BUILD(URX_NOP, 0); | 722 op = URX_BUILD(URX_NOP, 0); |
718 appendOp(op); | 723 fRXPat->fCompiledPat->addElement(op, *fStatus); |
719 appendOp(op); | 724 fRXPat->fCompiledPat->addElement(op, *fStatus); |
720 | 725 |
721 // On the Parentheses stack, start a new frame and add the postions | 726 // On the Parentheses stack, start a new frame and add the postions |
722 // of the URX_LB_CONT and the NOP. | 727 // of the URX_LB_CONT and the NOP. |
723 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state | 728 fParenStack.push(fModeFlags, *fStatus); // Mat
ch mode state |
724 fParenStack.push(lookBehindN, *fStatus); // Fra
me type | 729 fParenStack.push(lookBehindN, *fStatus); // Fra
me type |
725 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location | 730 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP location |
726 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location | 731 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
2nd NOP location |
727 | 732 |
728 // The final two instructions will be added when the ')' is encounte
red. | 733 // The final two instructions will be added when the ')' is encounte
red. |
729 } | 734 } |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
780 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 | 785 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 |
781 int32_t frameLoc; | 786 int32_t frameLoc; |
782 | 787 |
783 // Check for simple constructs, which may get special optimized code
. | 788 // Check for simple constructs, which may get special optimized code
. |
784 if (topLoc == fRXPat->fCompiledPat->size() - 1) { | 789 if (topLoc == fRXPat->fCompiledPat->size() - 1) { |
785 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); | 790 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); |
786 | 791 |
787 if (URX_TYPE(repeatedOp) == URX_SETREF) { | 792 if (URX_TYPE(repeatedOp) == URX_SETREF) { |
788 // Emit optimized code for [char set]+ | 793 // Emit optimized code for [char set]+ |
789 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); | 794 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); |
790 appendOp(loopOpI); | 795 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); |
791 frameLoc = allocateStackData(1); | 796 frameLoc = fRXPat->fFrameSize; |
| 797 fRXPat->fFrameSize++; |
792 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | 798 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); |
793 appendOp(loopOpC); | 799 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
794 break; | 800 break; |
795 } | 801 } |
796 | 802 |
797 if (URX_TYPE(repeatedOp) == URX_DOTANY || | 803 if (URX_TYPE(repeatedOp) == URX_DOTANY || |
798 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || | 804 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || |
799 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { | 805 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { |
800 // Emit Optimized code for .+ operations. | 806 // Emit Optimized code for .+ operations. |
801 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | 807 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); |
802 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | 808 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { |
803 // URX_LOOP_DOT_I operand is a flag indicating ". matche
s any" mode. | 809 // URX_LOOP_DOT_I operand is a flag indicating ". matche
s any" mode. |
804 loopOpI |= 1; | 810 loopOpI |= 1; |
805 } | 811 } |
806 if (fModeFlags & UREGEX_UNIX_LINES) { | 812 if (fModeFlags & UREGEX_UNIX_LINES) { |
807 loopOpI |= 2; | 813 loopOpI |= 2; |
808 } | 814 } |
809 appendOp(loopOpI); | 815 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); |
810 frameLoc = allocateStackData(1); | 816 frameLoc = fRXPat->fFrameSize; |
| 817 fRXPat->fFrameSize++; |
811 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | 818 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); |
812 appendOp(loopOpC); | 819 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
813 break; | 820 break; |
814 } | 821 } |
815 | 822 |
816 } | 823 } |
817 | 824 |
818 // General case. | 825 // General case. |
819 | 826 |
820 // Check for minimum match length of zero, which requires | 827 // Check for minimum match length of zero, which requires |
821 // extra loop-breaking code. | 828 // extra loop-breaking code. |
822 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { | 829 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { |
823 // Zero length match is possible. | 830 // Zero length match is possible. |
824 // Emit the code sequence that can handle it. | 831 // Emit the code sequence that can handle it. |
825 insertOp(topLoc); | 832 insertOp(topLoc); |
826 frameLoc = allocateStackData(1); | 833 frameLoc = fRXPat->fFrameSize; |
| 834 fRXPat->fFrameSize++; |
827 | 835 |
828 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); | 836 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); |
829 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 837 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
830 | 838 |
831 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); | 839 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); |
832 appendOp(op); | 840 fRXPat->fCompiledPat->addElement(op, *fStatus); |
833 } else { | 841 } else { |
834 // Simpler code when the repeated body must match something non-
empty | 842 // Simpler code when the repeated body must match something non-
empty |
835 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); | 843 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); |
836 appendOp(jmpOp); | 844 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); |
837 } | 845 } |
838 } | 846 } |
839 break; | 847 break; |
840 | 848 |
841 case doNGPlus: | 849 case doNGPlus: |
842 // Non-greedy '+?' compiles to | 850 // Non-greedy '+?' compiles to |
843 // 1. stuff to be repeated (already built) | 851 // 1. stuff to be repeated (already built) |
844 // 2. state-save 1 | 852 // 2. state-save 1 |
845 // 3. ... | 853 // 3. ... |
846 { | 854 { |
847 int32_t topLoc = blockTopLoc(FALSE); | 855 int32_t topLoc = blockTopLoc(FALSE); |
848 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); | 856 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); |
849 appendOp(saveStateOp); | 857 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); |
850 } | 858 } |
851 break; | 859 break; |
852 | 860 |
853 | 861 |
854 case doOpt: | 862 case doOpt: |
855 // Normal (greedy) ? quantifier. | 863 // Normal (greedy) ? quantifier. |
856 // Compiles to | 864 // Compiles to |
857 // 1. state save 3 | 865 // 1. state save 3 |
858 // 2. body of optional block | 866 // 2. body of optional block |
859 // 3. ... | 867 // 3. ... |
(...skipping 16 matching lines...) Expand all Loading... |
876 // This code is less than ideal, with two jmps instead of one, because
we can only | 884 // This code is less than ideal, with two jmps instead of one, because
we can only |
877 // insert one instruction at the top of the block being iterated. | 885 // insert one instruction at the top of the block being iterated. |
878 { | 886 { |
879 int32_t jmp1_loc = blockTopLoc(TRUE); | 887 int32_t jmp1_loc = blockTopLoc(TRUE); |
880 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); | 888 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); |
881 | 889 |
882 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); | 890 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); |
883 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); | 891 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); |
884 | 892 |
885 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); | 893 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); |
886 appendOp(jmp2_op); | 894 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); |
887 | 895 |
888 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); | 896 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); |
889 appendOp(save_op); | 897 fRXPat->fCompiledPat->addElement(save_op, *fStatus); |
890 } | 898 } |
891 break; | 899 break; |
892 | 900 |
893 | 901 |
894 case doStar: | 902 case doStar: |
895 // Normal (greedy) * quantifier. | 903 // Normal (greedy) * quantifier. |
896 // Compiles to | 904 // Compiles to |
897 // 1. STATE_SAVE 4 | 905 // 1. STATE_SAVE 4 |
898 // 2. body of stuff being iterated over | 906 // 2. body of stuff being iterated over |
899 // 3. JMP_SAV 2 | 907 // 3. JMP_SAV 2 |
(...skipping 21 matching lines...) Expand all Loading... |
921 | 929 |
922 // Check for simple *, where the construct being repeated | 930 // Check for simple *, where the construct being repeated |
923 // compiled to single opcode, and might be optimizable. | 931 // compiled to single opcode, and might be optimizable. |
924 if (topLoc == fRXPat->fCompiledPat->size() - 1) { | 932 if (topLoc == fRXPat->fCompiledPat->size() - 1) { |
925 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); | 933 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(t
opLoc); |
926 | 934 |
927 if (URX_TYPE(repeatedOp) == URX_SETREF) { | 935 if (URX_TYPE(repeatedOp) == URX_SETREF) { |
928 // Emit optimized code for a [char set]* | 936 // Emit optimized code for a [char set]* |
929 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); | 937 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedO
p)); |
930 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | 938 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); |
931 dataLoc = allocateStackData(1); | 939 dataLoc = fRXPat->fFrameSize; |
| 940 fRXPat->fFrameSize++; |
932 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | 941 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); |
933 appendOp(loopOpC); | 942 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
934 break; | 943 break; |
935 } | 944 } |
936 | 945 |
937 if (URX_TYPE(repeatedOp) == URX_DOTANY || | 946 if (URX_TYPE(repeatedOp) == URX_DOTANY || |
938 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || | 947 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || |
939 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { | 948 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { |
940 // Emit Optimized code for .* operations. | 949 // Emit Optimized code for .* operations. |
941 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | 950 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); |
942 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | 951 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { |
943 // URX_LOOP_DOT_I operand is a flag indicating . matches
any mode. | 952 // URX_LOOP_DOT_I operand is a flag indicating . matches
any mode. |
944 loopOpI |= 1; | 953 loopOpI |= 1; |
945 } | 954 } |
946 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { | 955 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { |
947 loopOpI |= 2; | 956 loopOpI |= 2; |
948 } | 957 } |
949 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | 958 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); |
950 dataLoc = allocateStackData(1); | 959 dataLoc = fRXPat->fFrameSize; |
| 960 fRXPat->fFrameSize++; |
951 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | 961 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); |
952 appendOp(loopOpC); | 962 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
953 break; | 963 break; |
954 } | 964 } |
955 } | 965 } |
956 | 966 |
957 // Emit general case code for this * | 967 // Emit general case code for this * |
958 // The optimizations did not apply. | 968 // The optimizations did not apply. |
959 | 969 |
960 int32_t saveStateLoc = blockTopLoc(TRUE); | 970 int32_t saveStateLoc = blockTopLoc(TRUE); |
961 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); | 971 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); |
962 | 972 |
963 // Check for minimum match length of zero, which requires | 973 // Check for minimum match length of zero, which requires |
964 // extra loop-breaking code. | 974 // extra loop-breaking code. |
965 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) ==
0) { | 975 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) ==
0) { |
966 insertOp(saveStateLoc); | 976 insertOp(saveStateLoc); |
967 dataLoc = allocateStackData(1); | 977 dataLoc = fRXPat->fFrameSize; |
| 978 fRXPat->fFrameSize++; |
968 | 979 |
969 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); | 980 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); |
970 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); | 981 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); |
971 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); | 982 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); |
972 } | 983 } |
973 | 984 |
974 // Locate the position in the compiled pattern where the match will
continue | 985 // Locate the position in the compiled pattern where the match will
continue |
975 // after completing the *. (4 or 5 in the comment above) | 986 // after completing the *. (4 or 5 in the comment above) |
976 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | 987 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
977 | 988 |
978 // Put together the save state op store it into the compiled code. | 989 // Put together the save state op store it into the compiled code. |
979 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); | 990 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); |
980 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); | 991 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); |
981 | 992 |
982 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled patt
ern. | 993 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled patt
ern. |
983 appendOp(jmpOp); | 994 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); |
984 } | 995 } |
985 break; | 996 break; |
986 | 997 |
987 case doNGStar: | 998 case doNGStar: |
988 // Non-greedy *? quantifier | 999 // Non-greedy *? quantifier |
989 // compiles to | 1000 // compiles to |
990 // 1. JMP 3 | 1001 // 1. JMP 3 |
991 // 2. body of stuff being iterated over | 1002 // 2. body of stuff being iterated over |
992 // 3. STATE_SAVE 2 | 1003 // 3. STATE_SAVE 2 |
993 // 4 ... | 1004 // 4 ... |
994 { | 1005 { |
995 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1
. | 1006 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1
. |
996 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3
. | 1007 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3
. |
997 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); | 1008 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); |
998 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); | 1009 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); |
999 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); | 1010 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); |
1000 appendOp(stateSaveOp); | 1011 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); |
1001 } | 1012 } |
1002 break; | 1013 break; |
1003 | 1014 |
1004 | 1015 |
1005 case doIntervalInit: | 1016 case doIntervalInit: |
1006 // The '{' opening an interval quantifier was just scanned. | 1017 // The '{' opening an interval quantifier was just scanned. |
1007 // Init the counter varaiables that will accumulate the values as the di
gits | 1018 // Init the counter varaiables that will accumulate the values as the di
gits |
1008 // are scanned. | 1019 // are scanned. |
1009 fIntervalLow = 0; | 1020 fIntervalLow = 0; |
1010 fIntervalUpper = -1; | 1021 fIntervalUpper = -1; |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1059 int32_t topLoc = blockTopLoc(FALSE); | 1070 int32_t topLoc = blockTopLoc(FALSE); |
1060 | 1071 |
1061 // Produce normal looping code. | 1072 // Produce normal looping code. |
1062 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); | 1073 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); |
1063 | 1074 |
1064 // Surround the just-emitted normal looping code with a STO_SP ... L
D_SP | 1075 // Surround the just-emitted normal looping code with a STO_SP ... L
D_SP |
1065 // just as if the loop was inclosed in atomic parentheses. | 1076 // just as if the loop was inclosed in atomic parentheses. |
1066 | 1077 |
1067 // First the STO_SP before the start of the loop | 1078 // First the STO_SP before the start of the loop |
1068 insertOp(topLoc); | 1079 insertOp(topLoc); |
1069 | 1080 int32_t varLoc = fRXPat->fDataSize; // Reserve a data locatio
n for saving the |
1070 int32_t varLoc = allocateData(1); // Reserve a data location for
saving the | 1081 fRXPat->fDataSize += 1; // state stack ptr. |
1071 int32_t op = URX_BUILD(URX_STO_SP, varLoc); | 1082 int32_t op = URX_BUILD(URX_STO_SP, varLoc); |
1072 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1083 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
1073 | 1084 |
1074 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); | 1085 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); |
1075 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topL
oc); | 1086 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topL
oc); |
1076 loopOp++; // point LoopOp after the just-inserted STO_SP | 1087 loopOp++; // point LoopOp after the just-inserted STO_SP |
1077 fRXPat->fCompiledPat->push(loopOp, *fStatus); | 1088 fRXPat->fCompiledPat->push(loopOp, *fStatus); |
1078 | 1089 |
1079 // Then the LD_SP after the end of the loop | 1090 // Then the LD_SP after the end of the loop |
1080 op = URX_BUILD(URX_LD_SP, varLoc); | 1091 op = URX_BUILD(URX_LD_SP, varLoc); |
1081 appendOp(op); | 1092 fRXPat->fCompiledPat->addElement(op, *fStatus); |
1082 } | 1093 } |
1083 | 1094 |
1084 break; | 1095 break; |
1085 | 1096 |
1086 case doNGInterval: | 1097 case doNGInterval: |
1087 // Finished scanning a non-greedy {lower,upper}? interval. Generate the
code for it. | 1098 // Finished scanning a non-greedy {lower,upper}? interval. Generate the
code for it. |
1088 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); | 1099 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); |
1089 break; | 1100 break; |
1090 | 1101 |
1091 case doIntervalError: | 1102 case doIntervalError: |
(...skipping 23 matching lines...) Expand all Loading... |
1115 { | 1126 { |
1116 fixLiterals(FALSE); | 1127 fixLiterals(FALSE); |
1117 int32_t op; | 1128 int32_t op; |
1118 if (fModeFlags & UREGEX_DOTALL) { | 1129 if (fModeFlags & UREGEX_DOTALL) { |
1119 op = URX_BUILD(URX_DOTANY_ALL, 0); | 1130 op = URX_BUILD(URX_DOTANY_ALL, 0); |
1120 } else if (fModeFlags & UREGEX_UNIX_LINES) { | 1131 } else if (fModeFlags & UREGEX_UNIX_LINES) { |
1121 op = URX_BUILD(URX_DOTANY_UNIX, 0); | 1132 op = URX_BUILD(URX_DOTANY_UNIX, 0); |
1122 } else { | 1133 } else { |
1123 op = URX_BUILD(URX_DOTANY, 0); | 1134 op = URX_BUILD(URX_DOTANY, 0); |
1124 } | 1135 } |
1125 appendOp(op); | 1136 fRXPat->fCompiledPat->addElement(op, *fStatus); |
1126 } | 1137 } |
1127 break; | 1138 break; |
1128 | 1139 |
1129 case doCaret: | 1140 case doCaret: |
1130 { | 1141 { |
1131 fixLiterals(FALSE); | 1142 fixLiterals(FALSE); |
1132 int32_t op = 0; | 1143 int32_t op = 0; |
1133 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1144 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
1134 op = URX_CARET; | 1145 op = URX_CARET; |
1135 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1146 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
1136 op = URX_CARET_M; | 1147 op = URX_CARET_M; |
1137 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1148 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
1138 op = URX_CARET; // Only testing true start of input. | 1149 op = URX_CARET; // Only testing true start of input. |
1139 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1150 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
1140 op = URX_CARET_M_UNIX; | 1151 op = URX_CARET_M_UNIX; |
1141 } | 1152 } |
1142 appendOp(URX_BUILD(op, 0)); | 1153 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); |
1143 } | 1154 } |
1144 break; | 1155 break; |
1145 | 1156 |
1146 case doDollar: | 1157 case doDollar: |
1147 { | 1158 { |
1148 fixLiterals(FALSE); | 1159 fixLiterals(FALSE); |
1149 int32_t op = 0; | 1160 int32_t op = 0; |
1150 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1161 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
1151 op = URX_DOLLAR; | 1162 op = URX_DOLLAR; |
1152 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { | 1163 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) == 0) { |
1153 op = URX_DOLLAR_M; | 1164 op = URX_DOLLAR_M; |
1154 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1165 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
1155 op = URX_DOLLAR_D; | 1166 op = URX_DOLLAR_D; |
1156 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { | 1167 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & URE
GEX_UNIX_LINES) != 0) { |
1157 op = URX_DOLLAR_MD; | 1168 op = URX_DOLLAR_MD; |
1158 } | 1169 } |
1159 appendOp(URX_BUILD(op, 0)); | 1170 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); |
1160 } | 1171 } |
1161 break; | 1172 break; |
1162 | 1173 |
1163 case doBackslashA: | 1174 case doBackslashA: |
1164 fixLiterals(FALSE); | 1175 fixLiterals(FALSE); |
1165 appendOp(URX_BUILD(URX_CARET, 0)); | 1176 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); |
1166 break; | 1177 break; |
1167 | 1178 |
1168 case doBackslashB: | 1179 case doBackslashB: |
1169 { | 1180 { |
1170 #if UCONFIG_NO_BREAK_ITERATION==1 | 1181 #if UCONFIG_NO_BREAK_ITERATION==1 |
1171 if (fModeFlags & UREGEX_UWORD) { | 1182 if (fModeFlags & UREGEX_UWORD) { |
1172 error(U_UNSUPPORTED_ERROR); | 1183 error(U_UNSUPPORTED_ERROR); |
1173 } | 1184 } |
1174 #endif | 1185 #endif |
1175 fixLiterals(FALSE); | 1186 fixLiterals(FALSE); |
1176 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; | 1187 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; |
1177 appendOp(URX_BUILD(op, 1)); | 1188 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); |
1178 } | 1189 } |
1179 break; | 1190 break; |
1180 | 1191 |
1181 case doBackslashb: | 1192 case doBackslashb: |
1182 { | 1193 { |
1183 #if UCONFIG_NO_BREAK_ITERATION==1 | 1194 #if UCONFIG_NO_BREAK_ITERATION==1 |
1184 if (fModeFlags & UREGEX_UWORD) { | 1195 if (fModeFlags & UREGEX_UWORD) { |
1185 error(U_UNSUPPORTED_ERROR); | 1196 error(U_UNSUPPORTED_ERROR); |
1186 } | 1197 } |
1187 #endif | 1198 #endif |
1188 fixLiterals(FALSE); | 1199 fixLiterals(FALSE); |
1189 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; | 1200 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BAC
KSLASH_B; |
1190 appendOp(URX_BUILD(op, 0)); | 1201 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); |
1191 } | 1202 } |
1192 break; | 1203 break; |
1193 | 1204 |
1194 case doBackslashD: | 1205 case doBackslashD: |
1195 fixLiterals(FALSE); | 1206 fixLiterals(FALSE); |
1196 appendOp(URX_BUILD(URX_BACKSLASH_D, 1)); | 1207 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus
); |
1197 break; | 1208 break; |
1198 | 1209 |
1199 case doBackslashd: | 1210 case doBackslashd: |
1200 fixLiterals(FALSE); | 1211 fixLiterals(FALSE); |
1201 appendOp(URX_BUILD(URX_BACKSLASH_D, 0)); | 1212 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus
); |
1202 break; | 1213 break; |
1203 | 1214 |
1204 case doBackslashG: | 1215 case doBackslashG: |
1205 fixLiterals(FALSE); | 1216 fixLiterals(FALSE); |
1206 appendOp(URX_BUILD(URX_BACKSLASH_G, 0)); | 1217 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus
); |
1207 break; | 1218 break; |
1208 | 1219 |
1209 case doBackslashS: | 1220 case doBackslashS: |
1210 fixLiterals(FALSE); | 1221 fixLiterals(FALSE); |
1211 appendOp(URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET)); | 1222 fRXPat->fCompiledPat->addElement( |
| 1223 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); |
1212 break; | 1224 break; |
1213 | 1225 |
1214 case doBackslashs: | 1226 case doBackslashs: |
1215 fixLiterals(FALSE); | 1227 fixLiterals(FALSE); |
1216 appendOp(URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET)); | 1228 fRXPat->fCompiledPat->addElement( |
| 1229 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); |
1217 break; | 1230 break; |
1218 | 1231 |
1219 case doBackslashW: | 1232 case doBackslashW: |
1220 fixLiterals(FALSE); | 1233 fixLiterals(FALSE); |
1221 appendOp(URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET)); | 1234 fRXPat->fCompiledPat->addElement( |
| 1235 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); |
1222 break; | 1236 break; |
1223 | 1237 |
1224 case doBackslashw: | 1238 case doBackslashw: |
1225 fixLiterals(FALSE); | 1239 fixLiterals(FALSE); |
1226 appendOp(URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET)); | 1240 fRXPat->fCompiledPat->addElement( |
| 1241 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); |
1227 break; | 1242 break; |
1228 | 1243 |
1229 case doBackslashX: | 1244 case doBackslashX: |
1230 fixLiterals(FALSE); | 1245 fixLiterals(FALSE); |
1231 appendOp(URX_BUILD(URX_BACKSLASH_X, 0)); | 1246 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus
); |
1232 break; | 1247 break; |
1233 | 1248 |
1234 | 1249 |
1235 case doBackslashZ: | 1250 case doBackslashZ: |
1236 fixLiterals(FALSE); | 1251 fixLiterals(FALSE); |
1237 appendOp(URX_BUILD(URX_DOLLAR, 0)); | 1252 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); |
1238 break; | 1253 break; |
1239 | 1254 |
1240 case doBackslashz: | 1255 case doBackslashz: |
1241 fixLiterals(FALSE); | 1256 fixLiterals(FALSE); |
1242 appendOp(URX_BUILD(URX_BACKSLASH_Z, 0)); | 1257 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus
); |
1243 break; | 1258 break; |
1244 | 1259 |
1245 case doEscapeError: | 1260 case doEscapeError: |
1246 error(U_REGEX_BAD_ESCAPE_SEQUENCE); | 1261 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
1247 break; | 1262 break; |
1248 | 1263 |
1249 case doExit: | 1264 case doExit: |
1250 fixLiterals(FALSE); | 1265 fixLiterals(FALSE); |
1251 returnVal = FALSE; | 1266 returnVal = FALSE; |
1252 break; | 1267 break; |
1253 | 1268 |
1254 case doProperty: | 1269 case doProperty: |
1255 { | 1270 { |
1256 fixLiterals(FALSE); | 1271 fixLiterals(FALSE); |
1257 UnicodeSet *theSet = scanProp(); | 1272 UnicodeSet *theSet = scanProp(); |
1258 compileSet(theSet); | 1273 compileSet(theSet); |
1259 } | 1274 } |
1260 break; | 1275 break; |
1261 | 1276 |
1262 case doNamedChar: | 1277 case doNamedChar: |
1263 { | 1278 { |
1264 UChar32 c = scanNamedChar(); | 1279 UChar32 c = scanNamedChar(); |
1265 literalChar(c); | 1280 literalChar(c); |
1266 } | 1281 } |
1267 break; | 1282 break; |
1268 | 1283 |
1269 | 1284 |
1270 case doBackRef: | 1285 case doBackRef: |
1271 // BackReference. Somewhat unusual in that the front-end can not comple
tely parse | 1286 // BackReference. Somewhat unusual in that the front-end can not comple
tely parse |
1272 // the regular expression, because the number of digits
to be consumed | 1287 // the regular expression, because the number of digits
to be consumed |
1273 // depends on the number of capture groups that have bee
n defined. So | 1288 // depends on the number of capture groups that have bee
n defined. So |
1274 // we have to do it here instead. | 1289 // we have to do it here instead. |
1275 { | 1290 { |
1276 int32_t numCaptureGroups = fRXPat->fGroupMap->size(); | 1291 int32_t numCaptureGroups = fRXPat->fGroupMap->size(); |
1277 int32_t groupNum = 0; | 1292 int32_t groupNum = 0; |
1278 UChar32 c = fC.fChar; | 1293 UChar32 c = fC.fChar; |
(...skipping 19 matching lines...) Expand all Loading... |
1298 // of compilation, it will be changed to the variable's location. | 1313 // of compilation, it will be changed to the variable's location. |
1299 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal
escape sequence, | 1314 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal
escape sequence, |
1300 // and shouldn't enter this code path at
all. | 1315 // and shouldn't enter this code path at
all. |
1301 fixLiterals(FALSE); | 1316 fixLiterals(FALSE); |
1302 int32_t op; | 1317 int32_t op; |
1303 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1318 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
1304 op = URX_BUILD(URX_BACKREF_I, groupNum); | 1319 op = URX_BUILD(URX_BACKREF_I, groupNum); |
1305 } else { | 1320 } else { |
1306 op = URX_BUILD(URX_BACKREF, groupNum); | 1321 op = URX_BUILD(URX_BACKREF, groupNum); |
1307 } | 1322 } |
1308 appendOp(op); | 1323 fRXPat->fCompiledPat->addElement(op, *fStatus); |
1309 } | 1324 } |
1310 break; | 1325 break; |
1311 | 1326 |
1312 | 1327 |
1313 case doPossessivePlus: | 1328 case doPossessivePlus: |
1314 // Possessive ++ quantifier. | 1329 // Possessive ++ quantifier. |
1315 // Compiles to | 1330 // Compiles to |
1316 // 1. STO_SP | 1331 // 1. STO_SP |
1317 // 2. body of stuff being iterated over | 1332 // 2. body of stuff being iterated over |
1318 // 3. STATE_SAVE 5 | 1333 // 3. STATE_SAVE 5 |
1319 // 4. JMP 2 | 1334 // 4. JMP 2 |
1320 // 5. LD_SP | 1335 // 5. LD_SP |
1321 // 6. ... | 1336 // 6. ... |
1322 // | 1337 // |
1323 // Note: TODO: This is pretty inefficient. A mass of saved state is
built up | 1338 // Note: TODO: This is pretty inefficient. A mass of saved state is
built up |
1324 // then unconditionally discarded. Perhaps introduce a n
ew opcode. Ticket 6056 | 1339 // then unconditionally discarded. Perhaps introduce a n
ew opcode. Ticket 6056 |
1325 // | 1340 // |
1326 { | 1341 { |
1327 // Emit the STO_SP | 1342 // Emit the STO_SP |
1328 int32_t topLoc = blockTopLoc(TRUE); | 1343 int32_t topLoc = blockTopLoc(TRUE); |
1329 int32_t stoLoc = allocateData(1); // Reserve the data location fo
r storing save stack ptr. | 1344 int32_t stoLoc = fRXPat->fDataSize; |
| 1345 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. |
1330 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | 1346 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); |
1331 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1347 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
1332 | 1348 |
1333 // Emit the STATE_SAVE | 1349 // Emit the STATE_SAVE |
1334 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); | 1350 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); |
1335 appendOp(op); | 1351 fRXPat->fCompiledPat->addElement(op, *fStatus); |
1336 | 1352 |
1337 // Emit the JMP | 1353 // Emit the JMP |
1338 op = URX_BUILD(URX_JMP, topLoc+1); | 1354 op = URX_BUILD(URX_JMP, topLoc+1); |
1339 appendOp(op); | 1355 fRXPat->fCompiledPat->addElement(op, *fStatus); |
1340 | 1356 |
1341 // Emit the LD_SP | 1357 // Emit the LD_SP |
1342 op = URX_BUILD(URX_LD_SP, stoLoc); | 1358 op = URX_BUILD(URX_LD_SP, stoLoc); |
1343 appendOp(op); | 1359 fRXPat->fCompiledPat->addElement(op, *fStatus); |
1344 } | 1360 } |
1345 break; | 1361 break; |
1346 | 1362 |
1347 case doPossessiveStar: | 1363 case doPossessiveStar: |
1348 // Possessive *+ quantifier. | 1364 // Possessive *+ quantifier. |
1349 // Compiles to | 1365 // Compiles to |
1350 // 1. STO_SP loc | 1366 // 1. STO_SP loc |
1351 // 2. STATE_SAVE 5 | 1367 // 2. STATE_SAVE 5 |
1352 // 3. body of stuff being iterated over | 1368 // 3. body of stuff being iterated over |
1353 // 4. JMP 2 | 1369 // 4. JMP 2 |
1354 // 5. LD_SP loc | 1370 // 5. LD_SP loc |
1355 // 6 ... | 1371 // 6 ... |
1356 // TODO: do something to cut back the state stack each time through the
loop. | 1372 // TODO: do something to cut back the state stack each time through the
loop. |
1357 { | 1373 { |
1358 // Reserve two slots at the top of the block. | 1374 // Reserve two slots at the top of the block. |
1359 int32_t topLoc = blockTopLoc(TRUE); | 1375 int32_t topLoc = blockTopLoc(TRUE); |
1360 insertOp(topLoc); | 1376 insertOp(topLoc); |
1361 | 1377 |
1362 // emit STO_SP loc | 1378 // emit STO_SP loc |
1363 int32_t stoLoc = allocateData(1); // Reserve the data location
for storing save stack ptr. | 1379 int32_t stoLoc = fRXPat->fDataSize; |
| 1380 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. |
1364 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | 1381 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); |
1365 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1382 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
1366 | 1383 |
1367 // Emit the SAVE_STATE 5 | 1384 // Emit the SAVE_STATE 5 |
1368 int32_t L7 = fRXPat->fCompiledPat->size()+1; | 1385 int32_t L7 = fRXPat->fCompiledPat->size()+1; |
1369 op = URX_BUILD(URX_STATE_SAVE, L7); | 1386 op = URX_BUILD(URX_STATE_SAVE, L7); |
1370 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | 1387 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); |
1371 | 1388 |
1372 // Append the JMP operation. | 1389 // Append the JMP operation. |
1373 op = URX_BUILD(URX_JMP, topLoc+1); | 1390 op = URX_BUILD(URX_JMP, topLoc+1); |
1374 appendOp(op); | 1391 fRXPat->fCompiledPat->addElement(op, *fStatus); |
1375 | 1392 |
1376 // Emit the LD_SP loc | 1393 // Emit the LD_SP loc |
1377 op = URX_BUILD(URX_LD_SP, stoLoc); | 1394 op = URX_BUILD(URX_LD_SP, stoLoc); |
1378 appendOp(op); | 1395 fRXPat->fCompiledPat->addElement(op, *fStatus); |
1379 } | 1396 } |
1380 break; | 1397 break; |
1381 | 1398 |
1382 case doPossessiveOpt: | 1399 case doPossessiveOpt: |
1383 // Possessive ?+ quantifier. | 1400 // Possessive ?+ quantifier. |
1384 // Compiles to | 1401 // Compiles to |
1385 // 1. STO_SP loc | 1402 // 1. STO_SP loc |
1386 // 2. SAVE_STATE 5 | 1403 // 2. SAVE_STATE 5 |
1387 // 3. body of optional block | 1404 // 3. body of optional block |
1388 // 4. LD_SP loc | 1405 // 4. LD_SP loc |
1389 // 5. ... | 1406 // 5. ... |
1390 // | 1407 // |
1391 { | 1408 { |
1392 // Reserve two slots at the top of the block. | 1409 // Reserve two slots at the top of the block. |
1393 int32_t topLoc = blockTopLoc(TRUE); | 1410 int32_t topLoc = blockTopLoc(TRUE); |
1394 insertOp(topLoc); | 1411 insertOp(topLoc); |
1395 | 1412 |
1396 // Emit the STO_SP | 1413 // Emit the STO_SP |
1397 int32_t stoLoc = allocateData(1); // Reserve the data location f
or storing save stack ptr. | 1414 int32_t stoLoc = fRXPat->fDataSize; |
| 1415 fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. |
1398 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | 1416 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); |
1399 fRXPat->fCompiledPat->setElementAt(op, topLoc); | 1417 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
1400 | 1418 |
1401 // Emit the SAVE_STATE | 1419 // Emit the SAVE_STATE |
1402 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | 1420 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
1403 op = URX_BUILD(URX_STATE_SAVE, continueLoc); | 1421 op = URX_BUILD(URX_STATE_SAVE, continueLoc); |
1404 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | 1422 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); |
1405 | 1423 |
1406 // Emit the LD_SP | 1424 // Emit the LD_SP |
1407 op = URX_BUILD(URX_LD_SP, stoLoc); | 1425 op = URX_BUILD(URX_LD_SP, stoLoc); |
1408 appendOp(op); | 1426 fRXPat->fCompiledPat->addElement(op, *fStatus); |
1409 } | 1427 } |
1410 break; | 1428 break; |
1411 | 1429 |
1412 | 1430 |
1413 case doBeginMatchMode: | 1431 case doBeginMatchMode: |
1414 fNewModeFlags = fModeFlags; | 1432 fNewModeFlags = fModeFlags; |
1415 fSetModeFlag = TRUE; | 1433 fSetModeFlag = TRUE; |
1416 break; | 1434 break; |
1417 | 1435 |
1418 case doMatchMode: // (?i) and similar | 1436 case doMatchMode: // (?i) and similar |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1455 // We've got a (?i: or similar. Begin a parenthesized block, save old | 1473 // We've got a (?i: or similar. Begin a parenthesized block, save old |
1456 // mode flags so they can be restored at the close of the block. | 1474 // mode flags so they can be restored at the close of the block. |
1457 // | 1475 // |
1458 // Compile to a | 1476 // Compile to a |
1459 // - NOP, which later may be replaced by a save-state if the | 1477 // - NOP, which later may be replaced by a save-state if the |
1460 // parenthesized group gets a * quantifier, followed by | 1478 // parenthesized group gets a * quantifier, followed by |
1461 // - NOP, which may later be replaced by a save-state if there | 1479 // - NOP, which may later be replaced by a save-state if there |
1462 // is an '|' alternation within the parens. | 1480 // is an '|' alternation within the parens. |
1463 { | 1481 { |
1464 fixLiterals(FALSE); | 1482 fixLiterals(FALSE); |
1465 appendOp(URX_BUILD(URX_NOP, 0)); | 1483 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
1466 appendOp(URX_BUILD(URX_NOP, 0)); | 1484 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
1467 | 1485 |
1468 // On the Parentheses stack, start a new frame and add the postions | 1486 // On the Parentheses stack, start a new frame and add the postions |
1469 // of the two NOPs (a normal non-capturing () frame, except for th
e | 1487 // of the two NOPs (a normal non-capturing () frame, except for th
e |
1470 // saving of the orignal mode flags.) | 1488 // saving of the orignal mode flags.) |
1471 fParenStack.push(fModeFlags, *fStatus); | 1489 fParenStack.push(fModeFlags, *fStatus); |
1472 fParenStack.push(flags, *fStatus); // Fra
me Marker | 1490 fParenStack.push(flags, *fStatus); // Fra
me Marker |
1473 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP | 1491 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The
first NOP |
1474 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP | 1492 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The
second NOP |
1475 | 1493 |
1476 // Set the current mode flags to the new values. | 1494 // Set the current mode flags to the new values. |
(...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1617 // Finished a complete set expression, including all nested sets. | 1635 // Finished a complete set expression, including all nested sets. |
1618 // The close bracket has already triggered clearing out pending set op
erators, | 1636 // The close bracket has already triggered clearing out pending set op
erators, |
1619 // the operator stack should be empty and the operand stack should ha
ve just | 1637 // the operator stack should be empty and the operand stack should ha
ve just |
1620 // one entry, the result set. | 1638 // one entry, the result set. |
1621 U_ASSERT(fSetOpStack.empty()); | 1639 U_ASSERT(fSetOpStack.empty()); |
1622 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); | 1640 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); |
1623 U_ASSERT(fSetStack.empty()); | 1641 U_ASSERT(fSetStack.empty()); |
1624 compileSet(theSet); | 1642 compileSet(theSet); |
1625 break; | 1643 break; |
1626 } | 1644 } |
1627 | 1645 |
1628 case doSetIntersection2: | 1646 case doSetIntersection2: |
1629 // Have scanned something like [abc&& | 1647 // Have scanned something like [abc&& |
1630 setPushOp(setIntersection2); | 1648 setPushOp(setIntersection2); |
1631 break; | 1649 break; |
1632 | 1650 |
1633 case doSetLiteral: | 1651 case doSetLiteral: |
1634 // Union the just-scanned literal character into the set being built. | 1652 // Union the just-scanned literal character into the set being built. |
1635 // This operation is the highest precedence set operation, so we can
always do | 1653 // This operation is the highest precedence set operation, so we can
always do |
1636 // it immediately, without waiting to see what follows. It is necess
ary to perform | 1654 // it immediately, without waiting to see what follows. It is necess
ary to perform |
1637 // any pending '-' or '&' operation first, because these have the sam
e precedence | 1655 // any pending '-' or '&' operation first, because these have the sam
e precedence |
1638 // as union-ing in a literal' | 1656 // as union-ing in a literal' |
1639 { | 1657 { |
1640 setEval(setUnion); | 1658 setEval(setUnion); |
1641 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); | 1659 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); |
1642 s->add(fC.fChar); | 1660 s->add(fC.fChar); |
1643 fLastSetLiteral = fC.fChar; | 1661 fLastSetLiteral = fC.fChar; |
1644 break; | 1662 break; |
1645 } | 1663 } |
1646 | 1664 |
1647 case doSetLiteralEscaped: | 1665 case doSetLiteralEscaped: |
1648 // A back-slash escaped literal character was encountered. | 1666 // A back-slash escaped literal character was encountered. |
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1723 case doSetPosixProp: | 1741 case doSetPosixProp: |
1724 { | 1742 { |
1725 UnicodeSet *s = scanPosixProp(); | 1743 UnicodeSet *s = scanPosixProp(); |
1726 if (s != NULL) { | 1744 if (s != NULL) { |
1727 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); | 1745 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); |
1728 tos->addAll(*s); | 1746 tos->addAll(*s); |
1729 delete s; | 1747 delete s; |
1730 } // else error. scanProp() reported the error status already. | 1748 } // else error. scanProp() reported the error status already. |
1731 } | 1749 } |
1732 break; | 1750 break; |
1733 | 1751 |
1734 case doSetProp: | 1752 case doSetProp: |
1735 // Scanned a \p \P within [brackets]. | 1753 // Scanned a \p \P within [brackets]. |
1736 { | 1754 { |
1737 UnicodeSet *s = scanProp(); | 1755 UnicodeSet *s = scanProp(); |
1738 if (s != NULL) { | 1756 if (s != NULL) { |
1739 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); | 1757 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); |
1740 tos->addAll(*s); | 1758 tos->addAll(*s); |
1741 delete s; | 1759 delete s; |
1742 } // else error. scanProp() reported the error status already. | 1760 } // else error. scanProp() reported the error status already. |
1743 } | 1761 } |
1744 break; | 1762 break; |
1745 | 1763 |
1746 | 1764 |
1747 case doSetRange: | 1765 case doSetRange: |
1748 // We have scanned literal-literal. Add the range to the set. | 1766 // We have scanned literal-literal. Add the range to the set. |
1749 // The left character is already in the set, and is saved in fLastSetLit
eral. | 1767 // The left character is already in the set, and is saved in fLastSetLit
eral. |
1750 // The right side is the current character. | 1768 // The right side is the current character. |
1751 // Lower Limit > Upper limit being an error matches both Java | 1769 // Lower Limit > Upper limit being an error matches both Java |
1752 // and ICU UnicodeSet behavior. | 1770 // and ICU UnicodeSet behavior. |
1753 { | 1771 { |
1754 if (fLastSetLiteral > fC.fChar) { | 1772 if (fLastSetLiteral > fC.fChar) { |
1755 error(U_REGEX_INVALID_RANGE); | 1773 error(U_REGEX_INVALID_RANGE); |
1756 } | 1774 } |
1757 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); | 1775 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); |
1758 s->add(fLastSetLiteral, fC.fChar); | 1776 s->add(fLastSetLiteral, fC.fChar); |
1759 break; | 1777 break; |
1760 } | 1778 } |
1761 | 1779 |
1762 default: | 1780 default: |
1763 U_ASSERT(FALSE); | 1781 U_ASSERT(FALSE); |
1764 error(U_REGEX_INTERNAL_ERROR); | 1782 error(U_REGEX_INTERNAL_ERROR); |
1765 break; | 1783 break; |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1804 | 1822 |
1805 // If no literal characters have been scanned but not yet had code generated | 1823 // If no literal characters have been scanned but not yet had code generated |
1806 // for them, nothing needs to be done. | 1824 // for them, nothing needs to be done. |
1807 if (fLiteralChars.length() == 0) { | 1825 if (fLiteralChars.length() == 0) { |
1808 return; | 1826 return; |
1809 } | 1827 } |
1810 | 1828 |
1811 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.lengt
h(), -1); | 1829 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.lengt
h(), -1); |
1812 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); | 1830 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); |
1813 | 1831 |
1814 // Split: We need to ensure that the last item in the compiled pattern | 1832 // Split: We need to ensure that the last item in the compiled pattern |
1815 // refers only to the last literal scanned in the pattern, so that | 1833 // refers only to the last literal scanned in the pattern, so that |
1816 // quantifiers (*, +, etc.) affect only it, and not a longer string. | 1834 // quantifiers (*, +, etc.) affect only it, and not a longer string. |
1817 // Split before case folding for case insensitive matches. | 1835 // Split before case folding for case insensitive matches. |
1818 | 1836 |
1819 if (split) { | 1837 if (split) { |
1820 fLiteralChars.truncate(indexOfLastCodePoint); | 1838 fLiteralChars.truncate(indexOfLastCodePoint); |
1821 fixLiterals(FALSE); // Recursive call, emit code to match the first pa
rt of the string. | 1839 fixLiterals(FALSE); // Recursive call, emit code to match the first pa
rt of the string. |
1822 // Note that the truncated literal string may be
empty, in which case | 1840 // Note that the truncated literal string may be
empty, in which case |
1823 // nothing will be emitted. | 1841 // nothing will be emitted. |
1824 | 1842 |
1825 literalChar(lastCodePoint); // Re-add the last code point as if it were
a new literal. | 1843 literalChar(lastCodePoint); // Re-add the last code point as if it were
a new literal. |
1826 fixLiterals(FALSE); // Second recursive call, code for the fina
l code point. | 1844 fixLiterals(FALSE); // Second recursive call, code for the fina
l code point. |
1827 return; | 1845 return; |
1828 } | 1846 } |
1829 | 1847 |
1830 // If we are doing case-insensitive matching, case fold the string. This ma
y expand | 1848 // If we are doing case-insensitive matching, case fold the string. This ma
y expand |
1831 // the string, e.g. the German sharp-s turns into "ss" | 1849 // the string, e.g. the German sharp-s turns into "ss" |
1832 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1850 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
1833 fLiteralChars.foldCase(); | 1851 fLiteralChars.foldCase(); |
1834 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(),
-1); | 1852 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(),
-1); |
1835 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); | 1853 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); |
1836 } | 1854 } |
1837 | 1855 |
1838 if (indexOfLastCodePoint == 0) { | 1856 if (indexOfLastCodePoint == 0) { |
1839 // Single character, emit a URX_ONECHAR op to match it. | 1857 // Single character, emit a URX_ONECHAR op to match it. |
1840 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && | 1858 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && |
1841 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { | 1859 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { |
1842 op = URX_BUILD(URX_ONECHAR_I, lastCodePoint); | 1860 op = URX_BUILD(URX_ONECHAR_I, lastCodePoint); |
1843 } else { | 1861 } else { |
1844 op = URX_BUILD(URX_ONECHAR, lastCodePoint); | 1862 op = URX_BUILD(URX_ONECHAR, lastCodePoint); |
1845 } | 1863 } |
1846 appendOp(op); | 1864 fRXPat->fCompiledPat->addElement(op, *fStatus); |
1847 } else { | 1865 } else { |
1848 // Two or more chars, emit a URX_STRING to match them. | 1866 // Two or more chars, emit a URX_STRING to match them. |
1849 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 1867 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
1850 op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length()); | 1868 op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length()); |
1851 } else { | 1869 } else { |
1852 // TODO here: add optimization to split case sensitive strings of l
ength two | 1870 // TODO here: add optimization to split case sensitive strings of l
ength two |
1853 // into two single char ops, for efficiency. | 1871 // into two single char ops, for efficiency. |
1854 op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length()); | 1872 op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length()); |
1855 } | 1873 } |
1856 appendOp(op); | 1874 fRXPat->fCompiledPat->addElement(op, *fStatus); |
1857 op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length()); | 1875 op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length()); |
1858 appendOp(op); | 1876 fRXPat->fCompiledPat->addElement(op, *fStatus); |
1859 | 1877 |
1860 // Add this string into the accumulated strings of the compiled pattern. | 1878 // Add this string into the accumulated strings of the compiled pattern. |
1861 // The total size of the accumulated strings must be restricted to 24 bi
ts because | |
1862 // string indexes appear as compiled pattern operand values. | |
1863 // This is the only place that the pattern.fLiteralText string is modifi
ed. | |
1864 | |
1865 fRXPat->fLiteralText.append(fLiteralChars); | 1879 fRXPat->fLiteralText.append(fLiteralChars); |
1866 if (U_SUCCESS(*fStatus) && fRXPat->fLiteralText.length() > 0x00ffffff) { | |
1867 *fStatus = U_REGEX_PATTERN_TOO_BIG; | |
1868 } | |
1869 } | 1880 } |
1870 | 1881 |
1871 fLiteralChars.remove(); | 1882 fLiteralChars.remove(); |
1872 } | 1883 } |
1873 | 1884 |
1874 | 1885 |
1875 //------------------------------------------------------------------------------ | 1886 |
1876 // | 1887 |
1877 // appendOp() Append a new instruction onto the compiled pattern | |
1878 // Includes error checking, limiting the size of the | |
1879 // pattern to lengths that can be represented in the | |
1880 // 24 bit operand field of an instruction. | |
1881 // | |
1882 //------------------------------------------------------------------------------ | |
1883 void RegexCompile::appendOp(int32_t op) { | |
1884 fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1885 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) { | |
1886 *fStatus = U_REGEX_PATTERN_TOO_BIG; | |
1887 } | |
1888 } | |
1889 | 1888 |
1890 | 1889 |
1891 //------------------------------------------------------------------------------ | 1890 //------------------------------------------------------------------------------ |
1892 // | 1891 // |
1893 // insertOp() Insert a slot for a new opcode into the already | 1892 // insertOp() Insert a slot for a new opcode into the already |
1894 // compiled pattern code. | 1893 // compiled pattern code. |
1895 // | 1894 // |
1896 // Fill the slot with a NOP. Our caller will replace i
t | 1895 // Fill the slot with a NOP. Our caller will replace i
t |
1897 // with what they really wanted. | 1896 // with what they really wanted. |
1898 // | 1897 // |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1940 | 1939 |
1941 if (fMatchCloseParen > where) { | 1940 if (fMatchCloseParen > where) { |
1942 fMatchCloseParen++; | 1941 fMatchCloseParen++; |
1943 } | 1942 } |
1944 if (fMatchOpenParen > where) { | 1943 if (fMatchOpenParen > where) { |
1945 fMatchOpenParen++; | 1944 fMatchOpenParen++; |
1946 } | 1945 } |
1947 } | 1946 } |
1948 | 1947 |
1949 | 1948 |
| 1949 |
1950 //------------------------------------------------------------------------------ | 1950 //------------------------------------------------------------------------------ |
1951 // | 1951 // |
1952 // allocateData() Allocate storage in the matcher's static data area. | |
1953 // Return the index for the newly allocated data. | |
1954 // The storage won't actually exist until we are running
a match | |
1955 // operation, but the storage indexes are inserted into
various | |
1956 // opcodes while compiling the pattern. | |
1957 // | |
1958 //------------------------------------------------------------------------------ | |
1959 int32_t RegexCompile::allocateData(int32_t size) { | |
1960 if (U_FAILURE(*fStatus)) { | |
1961 return 0; | |
1962 } | |
1963 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) { | |
1964 *fStatus = U_REGEX_INTERNAL_ERROR; | |
1965 return 0; | |
1966 } | |
1967 int32_t dataIndex = fRXPat->fDataSize; | |
1968 fRXPat->fDataSize += size; | |
1969 if (fRXPat->fDataSize >= 0x00fffff0) { | |
1970 *fStatus = U_REGEX_PATTERN_TOO_BIG; | |
1971 } | |
1972 return dataIndex; | |
1973 } | |
1974 | |
1975 | |
1976 //------------------------------------------------------------------------------ | |
1977 // | |
1978 // allocateStackData() Allocate space in the back-tracking stack frame. | |
1979 // Return the index for the newly allocated data. | |
1980 // The frame indexes are inserted into various | |
1981 // opcodes while compiling the pattern, meaning that fra
me | |
1982 // size must be restricted to the size that will fit | |
1983 // as an operand (24 bits). | |
1984 // | |
1985 //------------------------------------------------------------------------------ | |
1986 int32_t RegexCompile::allocateStackData(int32_t size) { | |
1987 if (U_FAILURE(*fStatus)) { | |
1988 return 0; | |
1989 } | |
1990 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) { | |
1991 *fStatus = U_REGEX_INTERNAL_ERROR; | |
1992 return 0; | |
1993 } | |
1994 int32_t dataIndex = fRXPat->fFrameSize; | |
1995 fRXPat->fFrameSize += size; | |
1996 if (fRXPat->fFrameSize >= 0x00fffff0) { | |
1997 *fStatus = U_REGEX_PATTERN_TOO_BIG; | |
1998 } | |
1999 return dataIndex; | |
2000 } | |
2001 | |
2002 | |
2003 //------------------------------------------------------------------------------ | |
2004 // | |
2005 // blockTopLoc() Find or create a location in the compiled pattern | 1952 // blockTopLoc() Find or create a location in the compiled pattern |
2006 // at the start of the operation or block that has | 1953 // at the start of the operation or block that has |
2007 // just been compiled. Needed when a quantifier (* or | 1954 // just been compiled. Needed when a quantifier (* or |
2008 // whatever) appears, and we need to add an operation | 1955 // whatever) appears, and we need to add an operation |
2009 // at the start of the thing being quantified. | 1956 // at the start of the thing being quantified. |
2010 // | 1957 // |
2011 // (Parenthesized Blocks) have a slot with a NOP that | 1958 // (Parenthesized Blocks) have a slot with a NOP that |
2012 // is reserved for this purpose. .* or similar don't | 1959 // is reserved for this purpose. .* or similar don't |
2013 // and a slot needs to be added. | 1960 // and a slot needs to be added. |
2014 // | 1961 // |
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2110 // Capturing Parentheses. | 2057 // Capturing Parentheses. |
2111 // Insert a End Capture op into the pattern. | 2058 // Insert a End Capture op into the pattern. |
2112 // The frame offset of the variables for this cg is obtained from the | 2059 // The frame offset of the variables for this cg is obtained from the |
2113 // start capture op and put it into the end-capture op. | 2060 // start capture op and put it into the end-capture op. |
2114 { | 2061 { |
2115 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMat
chOpenParen+1); | 2062 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMat
chOpenParen+1); |
2116 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); | 2063 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); |
2117 | 2064 |
2118 int32_t frameVarLocation = URX_VAL(captureOp); | 2065 int32_t frameVarLocation = URX_VAL(captureOp); |
2119 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation
); | 2066 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation
); |
2120 appendOp(endCaptureOp); | 2067 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); |
2121 } | 2068 } |
2122 break; | 2069 break; |
2123 case atomic: | 2070 case atomic: |
2124 // Atomic Parenthesis. | 2071 // Atomic Parenthesis. |
2125 // Insert a LD_SP operation to restore the state stack to the position | 2072 // Insert a LD_SP operation to restore the state stack to the position |
2126 // it was when the atomic parens were entered. | 2073 // it was when the atomic parens were entered. |
2127 { | 2074 { |
2128 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOp
enParen+1); | 2075 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOp
enParen+1); |
2129 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); | 2076 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); |
2130 int32_t stoLoc = URX_VAL(stoOp); | 2077 int32_t stoLoc = URX_VAL(stoOp); |
2131 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); | 2078 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); |
2132 appendOp(ldOp); | 2079 fRXPat->fCompiledPat->addElement(ldOp, *fStatus); |
2133 } | 2080 } |
2134 break; | 2081 break; |
2135 | 2082 |
2136 case lookAhead: | 2083 case lookAhead: |
2137 { | 2084 { |
2138 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); | 2085 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); |
2139 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | 2086 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); |
2140 int32_t dataLoc = URX_VAL(startOp); | 2087 int32_t dataLoc = URX_VAL(startOp); |
2141 int32_t op = URX_BUILD(URX_LA_END, dataLoc); | 2088 int32_t op = URX_BUILD(URX_LA_END, dataLoc); |
2142 appendOp(op); | 2089 fRXPat->fCompiledPat->addElement(op, *fStatus); |
2143 } | 2090 } |
2144 break; | 2091 break; |
2145 | 2092 |
2146 case negLookAhead: | 2093 case negLookAhead: |
2147 { | 2094 { |
2148 // See comment at doOpenLookAheadNeg | 2095 // See comment at doOpenLookAheadNeg |
2149 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-1); | 2096 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-1); |
2150 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | 2097 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); |
2151 int32_t dataLoc = URX_VAL(startOp); | 2098 int32_t dataLoc = URX_VAL(startOp); |
2152 int32_t op = URX_BUILD(URX_LA_END, dataLoc); | 2099 int32_t op = URX_BUILD(URX_LA_END, dataLoc); |
2153 appendOp(op); | 2100 fRXPat->fCompiledPat->addElement(op, *fStatus); |
2154 op = URX_BUILD(URX_BACKTRACK, 0); | 2101 op = URX_BUILD(URX_BACKTRACK, 0); |
2155 appendOp(op); | 2102 fRXPat->fCompiledPat->addElement(op, *fStatus); |
2156 op = URX_BUILD(URX_LA_END, dataLoc); | 2103 op = URX_BUILD(URX_LA_END, dataLoc); |
2157 appendOp(op); | 2104 fRXPat->fCompiledPat->addElement(op, *fStatus); |
2158 | 2105 |
2159 // Patch the URX_SAVE near the top of the block. | 2106 // Patch the URX_SAVE near the top of the block. |
2160 // The destination of the SAVE is the final LA_END that was just add
ed. | 2107 // The destination of the SAVE is the final LA_END that was just add
ed. |
2161 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen); | 2108 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen); |
2162 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); | 2109 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); |
2163 int32_t dest = fRXPat->fCompiledPat->size()-1; | 2110 int32_t dest = fRXPat->fCompiledPat->size()-1; |
2164 saveOp = URX_BUILD(URX_STATE_SAVE, dest); | 2111 saveOp = URX_BUILD(URX_STATE_SAVE, dest); |
2165 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); | 2112 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); |
2166 } | 2113 } |
2167 break; | 2114 break; |
2168 | 2115 |
2169 case lookBehind: | 2116 case lookBehind: |
2170 { | 2117 { |
2171 // See comment at doOpenLookBehind. | 2118 // See comment at doOpenLookBehind. |
2172 | 2119 |
2173 // Append the URX_LB_END and URX_LA_END to the compiled pattern. | 2120 // Append the URX_LB_END and URX_LA_END to the compiled pattern. |
2174 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-4); | 2121 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-4); |
2175 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | 2122 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); |
2176 int32_t dataLoc = URX_VAL(startOp); | 2123 int32_t dataLoc = URX_VAL(startOp); |
2177 int32_t op = URX_BUILD(URX_LB_END, dataLoc); | 2124 int32_t op = URX_BUILD(URX_LB_END, dataLoc); |
2178 appendOp(op); | 2125 fRXPat->fCompiledPat->addElement(op, *fStatus); |
2179 op = URX_BUILD(URX_LA_END, dataLoc); | 2126 op = URX_BUILD(URX_LA_END, dataLoc); |
2180 appendOp(op); | 2127 fRXPat->fCompiledPat->addElement(op, *fStatus); |
2181 | 2128 |
2182 // Determine the min and max bounds for the length of the | 2129 // Determine the min and max bounds for the length of the |
2183 // string that the pattern can match. | 2130 // string that the pattern can match. |
2184 // An unbounded upper limit is an error. | 2131 // An unbounded upper limit is an error. |
2185 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | 2132 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; |
2186 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | 2133 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); |
2187 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | 2134 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); |
2188 if (URX_TYPE(maxML) != 0) { | |
2189 error(U_REGEX_LOOK_BEHIND_LIMIT); | |
2190 break; | |
2191 } | |
2192 if (maxML == INT32_MAX) { | 2135 if (maxML == INT32_MAX) { |
2193 error(U_REGEX_LOOK_BEHIND_LIMIT); | 2136 error(U_REGEX_LOOK_BEHIND_LIMIT); |
2194 break; | 2137 break; |
2195 } | 2138 } |
2196 U_ASSERT(minML <= maxML); | 2139 U_ASSERT(minML <= maxML); |
2197 | 2140 |
2198 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat | 2141 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat |
2199 // appears at the top of the look-behind block, at location fMatchO
penParen+1 | 2142 // appears at the top of the look-behind block, at location fMatchO
penParen+1 |
2200 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); | 2143 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); |
2201 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); | 2144 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); |
2202 | 2145 |
2203 } | 2146 } |
2204 break; | 2147 break; |
2205 | 2148 |
2206 | 2149 |
2207 | 2150 |
2208 case lookBehindN: | 2151 case lookBehindN: |
2209 { | 2152 { |
2210 // See comment at doOpenLookBehindNeg. | 2153 // See comment at doOpenLookBehindNeg. |
2211 | 2154 |
2212 // Append the URX_LBN_END to the compiled pattern. | 2155 // Append the URX_LBN_END to the compiled pattern. |
2213 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); | 2156 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen-5); |
2214 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | 2157 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); |
2215 int32_t dataLoc = URX_VAL(startOp); | 2158 int32_t dataLoc = URX_VAL(startOp); |
2216 int32_t op = URX_BUILD(URX_LBN_END, dataLoc); | 2159 int32_t op = URX_BUILD(URX_LBN_END, dataLoc); |
2217 appendOp(op); | 2160 fRXPat->fCompiledPat->addElement(op, *fStatus); |
2218 | 2161 |
2219 // Determine the min and max bounds for the length of the | 2162 // Determine the min and max bounds for the length of the |
2220 // string that the pattern can match. | 2163 // string that the pattern can match. |
2221 // An unbounded upper limit is an error. | 2164 // An unbounded upper limit is an error. |
2222 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | 2165 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; |
2223 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | 2166 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); |
2224 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | 2167 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); |
2225 if (URX_TYPE(maxML) != 0) { | |
2226 error(U_REGEX_LOOK_BEHIND_LIMIT); | |
2227 break; | |
2228 } | |
2229 if (maxML == INT32_MAX) { | 2168 if (maxML == INT32_MAX) { |
2230 error(U_REGEX_LOOK_BEHIND_LIMIT); | 2169 error(U_REGEX_LOOK_BEHIND_LIMIT); |
2231 break; | 2170 break; |
2232 } | 2171 } |
2233 U_ASSERT(minML <= maxML); | 2172 U_ASSERT(minML <= maxML); |
2234 | 2173 |
2235 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat | 2174 // Insert the min and max match len bounds into the URX_LB_CONT op t
hat |
2236 // appears at the top of the look-behind block, at location fMatchO
penParen+1 | 2175 // appears at the top of the look-behind block, at location fMatchO
penParen+1 |
2237 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); | 2176 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); |
2238 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); | 2177 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2273 // There shouldn't be any, but just in case. | 2212 // There shouldn't be any, but just in case. |
2274 // (Case Closure can add them; if we had a simple case closure available
that | 2213 // (Case Closure can add them; if we had a simple case closure available
that |
2275 // ignored strings, that would be better.) | 2214 // ignored strings, that would be better.) |
2276 theSet->removeAllStrings(); | 2215 theSet->removeAllStrings(); |
2277 int32_t setSize = theSet->size(); | 2216 int32_t setSize = theSet->size(); |
2278 | 2217 |
2279 switch (setSize) { | 2218 switch (setSize) { |
2280 case 0: | 2219 case 0: |
2281 { | 2220 { |
2282 // Set of no elements. Always fails to match. | 2221 // Set of no elements. Always fails to match. |
2283 appendOp(URX_BUILD(URX_BACKTRACK, 0)); | 2222 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStat
us); |
2284 delete theSet; | 2223 delete theSet; |
2285 } | 2224 } |
2286 break; | 2225 break; |
2287 | 2226 |
2288 case 1: | 2227 case 1: |
2289 { | 2228 { |
2290 // The set contains only a single code point. Put it into | 2229 // The set contains only a single code point. Put it into |
2291 // the compiled pattern as a single char operation rather | 2230 // the compiled pattern as a single char operation rather |
2292 // than a set, and discard the set itself. | 2231 // than a set, and discard the set itself. |
2293 literalChar(theSet->charAt(0)); | 2232 literalChar(theSet->charAt(0)); |
2294 delete theSet; | 2233 delete theSet; |
2295 } | 2234 } |
2296 break; | 2235 break; |
2297 | 2236 |
2298 default: | 2237 default: |
2299 { | 2238 { |
2300 // The set contains two or more chars. (the normal case) | 2239 // The set contains two or more chars. (the normal case) |
2301 // Put it into the compiled pattern as a set. | 2240 // Put it into the compiled pattern as a set. |
2302 int32_t setNumber = fRXPat->fSets->size(); | 2241 int32_t setNumber = fRXPat->fSets->size(); |
2303 fRXPat->fSets->addElement(theSet, *fStatus); | 2242 fRXPat->fSets->addElement(theSet, *fStatus); |
2304 int32_t setOp = URX_BUILD(URX_SETREF, setNumber); | 2243 int32_t setOp = URX_BUILD(URX_SETREF, setNumber); |
2305 appendOp(setOp); | 2244 fRXPat->fCompiledPat->addElement(setOp, *fStatus); |
2306 } | 2245 } |
2307 } | 2246 } |
2308 } | 2247 } |
2309 | 2248 |
2310 | 2249 |
2311 //------------------------------------------------------------------------------ | 2250 //------------------------------------------------------------------------------ |
2312 // | 2251 // |
2313 // compileInterval Generate the code for a {min, max} style interval quanti
fier. | 2252 // compileInterval Generate the code for a {min, max} style interval quanti
fier. |
2314 // Except for the specific opcodes used, the code is the sa
me | 2253 // Except for the specific opcodes used, the code is the sa
me |
2315 // for all three types (greedy, non-greedy, possessive) of | 2254 // for all three types (greedy, non-greedy, possessive) of |
(...skipping 18 matching lines...) Expand all Loading... |
2334 int32_t topOfBlock = blockTopLoc(TRUE); | 2273 int32_t topOfBlock = blockTopLoc(TRUE); |
2335 insertOp(topOfBlock); | 2274 insertOp(topOfBlock); |
2336 insertOp(topOfBlock); | 2275 insertOp(topOfBlock); |
2337 insertOp(topOfBlock); | 2276 insertOp(topOfBlock); |
2338 | 2277 |
2339 // The operands for the CTR_INIT opcode include the index in the matcher dat
a | 2278 // The operands for the CTR_INIT opcode include the index in the matcher dat
a |
2340 // of the counter. Allocate it now. There are two data items | 2279 // of the counter. Allocate it now. There are two data items |
2341 // counterLoc --> Loop counter | 2280 // counterLoc --> Loop counter |
2342 // +1 --> Input index (for breaking non-progressing loops) | 2281 // +1 --> Input index (for breaking non-progressing loops) |
2343 // (Only present if unbounded upper limit on loop) | 2282 // (Only present if unbounded upper limit on loop) |
2344 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1; | 2283 int32_t counterLoc = fRXPat->fFrameSize; |
2345 int32_t counterLoc = allocateStackData(dataSize); | 2284 fRXPat->fFrameSize++; |
| 2285 if (fIntervalUpper < 0) { |
| 2286 fRXPat->fFrameSize++; |
| 2287 } |
2346 | 2288 |
2347 int32_t op = URX_BUILD(InitOp, counterLoc); | 2289 int32_t op = URX_BUILD(InitOp, counterLoc); |
2348 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); | 2290 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); |
2349 | 2291 |
2350 // The second operand of CTR_INIT is the location following the end of the l
oop. | 2292 // The second operand of CTR_INIT is the location following the end of the l
oop. |
2351 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if
the | 2293 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if
the |
2352 // compilation of something later on causes the code to grow and the targe
t | 2294 // compilation of something later on causes the code to grow and the targe
t |
2353 // position to move. | 2295 // position to move. |
2354 int32_t loopEnd = fRXPat->fCompiledPat->size(); | 2296 int32_t loopEnd = fRXPat->fCompiledPat->size(); |
2355 op = URX_BUILD(URX_RELOC_OPRND, loopEnd); | 2297 op = URX_BUILD(URX_RELOC_OPRND, loopEnd); |
2356 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); | 2298 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); |
2357 | 2299 |
2358 // Followed by the min and max counts. | 2300 // Followed by the min and max counts. |
2359 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); | 2301 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); |
2360 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); | 2302 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); |
2361 | 2303 |
2362 // Append the CTR_LOOP op. The operand is the location of the CTR_INIT op. | 2304 // Append the CTR_LOOP op. The operand is the location of the CTR_INIT op. |
2363 // Goes at end of the block being looped over, so just append to the code
so far. | 2305 // Goes at end of the block being looped over, so just append to the code
so far. |
2364 op = URX_BUILD(LoopOp, topOfBlock); | 2306 op = URX_BUILD(LoopOp, topOfBlock); |
2365 appendOp(op); | 2307 fRXPat->fCompiledPat->addElement(op, *fStatus); |
2366 | 2308 |
2367 if ((fIntervalLow & 0xff000000) != 0 || | 2309 if ((fIntervalLow & 0xff000000) != 0 || |
2368 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { | 2310 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { |
2369 error(U_REGEX_NUMBER_TOO_BIG); | 2311 error(U_REGEX_NUMBER_TOO_BIG); |
2370 } | 2312 } |
2371 | 2313 |
2372 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { | 2314 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { |
2373 error(U_REGEX_MAX_LT_MIN); | 2315 error(U_REGEX_MAX_LT_MIN); |
2374 } | 2316 } |
2375 } | 2317 } |
2376 | 2318 |
2377 | 2319 |
2378 | 2320 |
2379 UBool RegexCompile::compileInlineInterval() { | 2321 UBool RegexCompile::compileInlineInterval() { |
2380 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { | 2322 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { |
2381 // Too big to inline. Fail, which will cause looping code to be generat
ed. | 2323 // Too big to inline. Fail, which will cause looping code to be generat
ed. |
2382 // (Upper < Lower picks up unbounded upper and errors, both.) | 2324 // (Upper < Lower picks up unbounded upper and errors, both.) |
2383 return FALSE; | 2325 return FALSE; |
2384 } | 2326 } |
2385 | 2327 |
2386 int32_t topOfBlock = blockTopLoc(FALSE); | 2328 int32_t topOfBlock = blockTopLoc(FALSE); |
2387 if (fIntervalUpper == 0) { | 2329 if (fIntervalUpper == 0) { |
2388 // Pathological case. Attempt no matches, as if the block doesn't exist
. | 2330 // Pathological case. Attempt no matches, as if the block doesn't exist
. |
2389 // Discard the generated code for the block. | |
2390 // If the block included parens, discard the info pertaining to them as
well. | |
2391 fRXPat->fCompiledPat->setSize(topOfBlock); | 2331 fRXPat->fCompiledPat->setSize(topOfBlock); |
2392 if (fMatchOpenParen >= topOfBlock) { | |
2393 fMatchOpenParen = -1; | |
2394 } | |
2395 if (fMatchCloseParen >= topOfBlock) { | |
2396 fMatchCloseParen = -1; | |
2397 } | |
2398 return TRUE; | 2332 return TRUE; |
2399 } | 2333 } |
2400 | 2334 |
2401 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { | 2335 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { |
2402 // The thing being repeated is not a single op, but some | 2336 // The thing being repeated is not a single op, but some |
2403 // more complex block. Do it as a loop, not inlines. | 2337 // more complex block. Do it as a loop, not inlines. |
2404 // Note that things "repeated" a max of once are handled as inline, be
cause | 2338 // Note that things "repeated" a max of once are handled as inline, be
cause |
2405 // the one copy of the code already generated is just fine. | 2339 // the one copy of the code already generated is just fine. |
2406 return FALSE; | 2340 return FALSE; |
2407 } | 2341 } |
(...skipping 14 matching lines...) Expand all Loading... |
2422 } | 2356 } |
2423 | 2357 |
2424 | 2358 |
2425 | 2359 |
2426 // Loop, emitting the op for the thing being repeated each time. | 2360 // Loop, emitting the op for the thing being repeated each time. |
2427 // Loop starts at 1 because one instance of the op already exists in the
pattern, | 2361 // Loop starts at 1 because one instance of the op already exists in the
pattern, |
2428 // it was put there when it was originally encountered. | 2362 // it was put there when it was originally encountered. |
2429 int32_t i; | 2363 int32_t i; |
2430 for (i=1; i<fIntervalUpper; i++ ) { | 2364 for (i=1; i<fIntervalUpper; i++ ) { |
2431 if (i == fIntervalLow) { | 2365 if (i == fIntervalLow) { |
2432 appendOp(saveOp); | 2366 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); |
2433 } | 2367 } |
2434 if (i > fIntervalLow) { | 2368 if (i > fIntervalLow) { |
2435 appendOp(saveOp); | 2369 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); |
2436 } | 2370 } |
2437 appendOp(op); | 2371 fRXPat->fCompiledPat->addElement(op, *fStatus); |
2438 } | 2372 } |
2439 return TRUE; | 2373 return TRUE; |
2440 } | 2374 } |
2441 | 2375 |
2442 | 2376 |
2443 | 2377 |
2444 //------------------------------------------------------------------------------ | 2378 //------------------------------------------------------------------------------ |
2445 // | 2379 // |
| 2380 // caseInsensitiveStart given a single code point from a pattern string, dete
rmine the |
| 2381 // set of characters that could potentially begin a case
-insensitive |
| 2382 // match of a string beginning with that character, usin
g full Unicode |
| 2383 // case insensitive matching. |
| 2384 // |
| 2385 // This is used in optimizing find(). |
| 2386 // |
| 2387 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but |
| 2388 // misses cases like this: |
| 2389 // A string from the pattern begins with 'ss' (although all we know |
| 2390 // in this context is that it begins with 's') |
| 2391 // The pattern could match a string beginning with a German sharp-s |
| 2392 // |
| 2393 // To the ordinary case closure for a character c, we add all other |
| 2394 // characters cx where the case closure of cx includes a string form th
at begins |
| 2395 // with the original character c. |
| 2396 // |
| 2397 // This function could be made smarter. The full pattern string is ava
ilable |
| 2398 // and it would be possible to verify that the extra characters being
added |
| 2399 // to the starting set fully match, rather than having just a first-ch
ar of the |
| 2400 // folded form match. |
| 2401 // |
| 2402 //------------------------------------------------------------------------------ |
| 2403 void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterCh
ars) { |
| 2404 |
| 2405 // Machine Generated below. |
| 2406 // It may need updating with new versions of Unicode. |
| 2407 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update i
s needed. |
| 2408 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/tru
nk/unicode/c/genregexcasing |
| 2409 |
| 2410 // Machine Generated Data. Do not hand edit. |
| 2411 static const UChar32 RECaseFixCodePoints[] = { |
| 2412 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, |
| 2413 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, |
| 2414 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x
1f07, |
| 2415 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60,
0x1f61, |
| 2416 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c,
0x110000}; |
| 2417 |
| 2418 static const int16_t RECaseFixStringOffsets[] = { |
| 2419 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, |
| 2420 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, |
| 2421 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, |
| 2422 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, |
| 2423 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0}; |
| 2424 |
| 2425 static const int16_t RECaseFixCounts[] = { |
| 2426 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, |
| 2427 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, |
| 2428 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, |
| 2429 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, |
| 2430 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0}; |
| 2431 |
| 2432 static const UChar RECaseFixData[] = { |
| 2433 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0x
df, |
| 2434 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0
x1fb3, |
| 2435 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0
x1fd3, |
| 2436 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0
x1fe3, |
| 2437 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0
xfb14, |
| 2438 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a,
0x1f83, |
| 2439 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f,
0x1f90, |
| 2440 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c,
0x1f95, |
| 2441 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9,
0x1fa2, |
| 2442 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae,
0x1fa7, |
| 2443 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0}; |
| 2444 |
| 2445 // End of machine generated data. |
| 2446 |
| 2447 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { |
| 2448 UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT); |
| 2449 starterChars->set(caseFoldedC, caseFoldedC); |
| 2450 |
| 2451 int32_t i; |
| 2452 for (i=0; RECaseFixCodePoints[i]<c ; i++) { |
| 2453 // Simple linear search through the sorted list of interesting code
points. |
| 2454 } |
| 2455 |
| 2456 if (RECaseFixCodePoints[i] == c) { |
| 2457 int32_t dataIndex = RECaseFixStringOffsets[i]; |
| 2458 int32_t numCharsToAdd = RECaseFixCounts[i]; |
| 2459 UChar32 cpToAdd = 0; |
| 2460 for (int32_t j=0; j<numCharsToAdd; j++) { |
| 2461 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd); |
| 2462 starterChars->add(cpToAdd); |
| 2463 } |
| 2464 } |
| 2465 |
| 2466 starterChars->closeOver(USET_CASE_INSENSITIVE); |
| 2467 starterChars->removeAllStrings(); |
| 2468 } else { |
| 2469 // Not a cased character. Just return it alone. |
| 2470 starterChars->set(c, c); |
| 2471 } |
| 2472 } |
| 2473 |
| 2474 |
| 2475 |
| 2476 |
| 2477 //------------------------------------------------------------------------------ |
| 2478 // |
2446 // matchStartType Determine how a match can start. | 2479 // matchStartType Determine how a match can start. |
2447 // Used to optimize find() operations. | 2480 // Used to optimize find() operations. |
2448 // | 2481 // |
2449 // Operation is very similar to minMatchLength(). Walk the
compiled | 2482 // Operation is very similar to minMatchLength(). Walk the
compiled |
2450 // pattern, keeping an on-going minimum-match-length. For a
ny | 2483 // pattern, keeping an on-going minimum-match-length. For a
ny |
2451 // op where the min match coming in is zero, add that ops po
ssible | 2484 // op where the min match coming in is zero, add that ops po
ssible |
2452 // starting matches to the possible starts for the overall p
attern. | 2485 // starting matches to the possible starts for the overall p
attern. |
2453 // | 2486 // |
2454 //------------------------------------------------------------------------------ | 2487 //------------------------------------------------------------------------------ |
2455 void RegexCompile::matchStartType() { | 2488 void RegexCompile::matchStartType() { |
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2507 case URX_BACKSLASH_G: | 2540 case URX_BACKSLASH_G: |
2508 case URX_BACKSLASH_Z: | 2541 case URX_BACKSLASH_Z: |
2509 case URX_DOLLAR: | 2542 case URX_DOLLAR: |
2510 case URX_DOLLAR_M: | 2543 case URX_DOLLAR_M: |
2511 case URX_DOLLAR_D: | 2544 case URX_DOLLAR_D: |
2512 case URX_DOLLAR_MD: | 2545 case URX_DOLLAR_MD: |
2513 case URX_RELOC_OPRND: | 2546 case URX_RELOC_OPRND: |
2514 case URX_STO_INP_LOC: | 2547 case URX_STO_INP_LOC: |
2515 case URX_BACKREF: // BackRef. Must assume that it might be a ze
ro length match | 2548 case URX_BACKREF: // BackRef. Must assume that it might be a ze
ro length match |
2516 case URX_BACKREF_I: | 2549 case URX_BACKREF_I: |
2517 | 2550 |
2518 case URX_STO_SP: // Setup for atomic or possessive blocks. Doe
sn't change what can match. | 2551 case URX_STO_SP: // Setup for atomic or possessive blocks. Doe
sn't change what can match. |
2519 case URX_LD_SP: | 2552 case URX_LD_SP: |
2520 break; | 2553 break; |
2521 | 2554 |
2522 case URX_CARET: | 2555 case URX_CARET: |
2523 if (atStart) { | 2556 if (atStart) { |
2524 fRXPat->fStartType = START_START; | 2557 fRXPat->fStartType = START_START; |
2525 } | 2558 } |
2526 break; | 2559 break; |
2527 | 2560 |
(...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2624 currentLen++; | 2657 currentLen++; |
2625 atStart = FALSE; | 2658 atStart = FALSE; |
2626 break; | 2659 break; |
2627 | 2660 |
2628 | 2661 |
2629 case URX_ONECHAR_I: | 2662 case URX_ONECHAR_I: |
2630 // Case Insensitive Single Character. | 2663 // Case Insensitive Single Character. |
2631 if (currentLen == 0) { | 2664 if (currentLen == 0) { |
2632 UChar32 c = URX_VAL(op); | 2665 UChar32 c = URX_VAL(op); |
2633 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { | 2666 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { |
2634 | 2667 UnicodeSet starters(c, c); |
2635 // Disable optimizations on first char of match. | 2668 starters.closeOver(USET_CASE_INSENSITIVE); |
2636 // TODO: Compute the set of chars that case fold to this cha
r, or to | 2669 // findCaseInsensitiveStarters(c, &starters); |
2637 // a string that begins with this char. | 2670 // For ONECHAR_I, no need to worry about text chars that e
xpand on folding into strings. |
2638 // For simple case folding, this code worked: | 2671 // The expanded folding can't match the pattern. |
2639 // UnicodeSet s(c, c); | 2672 fRXPat->fInitialChars->addAll(starters); |
2640 // s.closeOver(USET_CASE_INSENSITIVE); | |
2641 // fRXPat->fInitialChars->addAll(s); | |
2642 | |
2643 fRXPat->fInitialChars->clear(); | |
2644 fRXPat->fInitialChars->complement(); | |
2645 } else { | 2673 } else { |
2646 // Char has no case variants. Just add it as-is to the | 2674 // Char has no case variants. Just add it as-is to the |
2647 // set of possible starting chars. | 2675 // set of possible starting chars. |
2648 fRXPat->fInitialChars->add(c); | 2676 fRXPat->fInitialChars->add(c); |
2649 } | 2677 } |
2650 numInitialStrings += 2; | 2678 numInitialStrings += 2; |
2651 } | 2679 } |
2652 currentLen++; | 2680 currentLen++; |
2653 atStart = FALSE; | 2681 atStart = FALSE; |
2654 break; | 2682 break; |
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2757 loc++; | 2785 loc++; |
2758 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(
loc); | 2786 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(
loc); |
2759 int32_t stringLen = URX_VAL(stringLenOp); | 2787 int32_t stringLen = URX_VAL(stringLenOp); |
2760 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); | 2788 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); |
2761 U_ASSERT(stringLenOp >= 2); | 2789 U_ASSERT(stringLenOp >= 2); |
2762 if (currentLen == 0) { | 2790 if (currentLen == 0) { |
2763 // Add the starting character of this string to the set of p
ossible starting | 2791 // Add the starting character of this string to the set of p
ossible starting |
2764 // characters for this pattern. | 2792 // characters for this pattern. |
2765 int32_t stringStartIdx = URX_VAL(op); | 2793 int32_t stringStartIdx = URX_VAL(op); |
2766 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); | 2794 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); |
2767 UnicodeSet s(c, c); | 2795 UnicodeSet s; |
2768 | 2796 findCaseInsensitiveStarters(c, &s); |
2769 // TODO: compute correct set of starting chars for full cas
e folding. | |
2770 // For the moment, say any char can start. | |
2771 // s.closeOver(USET_CASE_INSENSITIVE); | |
2772 s.clear(); | |
2773 s.complement(); | |
2774 | |
2775 fRXPat->fInitialChars->addAll(s); | 2797 fRXPat->fInitialChars->addAll(s); |
2776 numInitialStrings += 2; // Matching on an initial string no
t possible. | 2798 numInitialStrings += 2; // Matching on an initial string no
t possible. |
2777 } | 2799 } |
2778 currentLen += stringLen; | 2800 currentLen += stringLen; |
2779 atStart = FALSE; | 2801 atStart = FALSE; |
2780 } | 2802 } |
2781 break; | 2803 break; |
2782 | 2804 |
2783 case URX_CTR_INIT: | 2805 case URX_CTR_INIT: |
2784 case URX_CTR_INIT_NG: | 2806 case URX_CTR_INIT_NG: |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2820 // don't change the minimum match | 2842 // don't change the minimum match |
2821 atStart = FALSE; | 2843 atStart = FALSE; |
2822 break; | 2844 break; |
2823 | 2845 |
2824 | 2846 |
2825 case URX_LA_START: | 2847 case URX_LA_START: |
2826 case URX_LB_START: | 2848 case URX_LB_START: |
2827 { | 2849 { |
2828 // Look-around. Scan forward until the matching look-ahead end, | 2850 // Look-around. Scan forward until the matching look-ahead end, |
2829 // without processing the look-around block. This is overly p
essimistic. | 2851 // without processing the look-around block. This is overly p
essimistic. |
2830 | 2852 |
2831 // Keep track of the nesting depth of look-around blocks. Boile
rplate code for | 2853 // Keep track of the nesting depth of look-around blocks. Boile
rplate code for |
2832 // lookahead contains two LA_END instructions, so count goes u
p by two | 2854 // lookahead contains two LA_END instructions, so count goes u
p by two |
2833 // for each LA_START. | 2855 // for each LA_START. |
2834 int32_t depth = (opType == URX_LA_START? 2: 1); | 2856 int32_t depth = (opType == URX_LA_START? 2: 1); |
2835 for (;;) { | 2857 for (;;) { |
2836 loc++; | 2858 loc++; |
2837 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); | 2859 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); |
2838 if (URX_TYPE(op) == URX_LA_START) { | 2860 if (URX_TYPE(op) == URX_LA_START) { |
2839 depth+=2; | 2861 depth+=2; |
2840 } | 2862 } |
(...skipping 539 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3380 | 3402 |
3381 case URX_STRING_I: | 3403 case URX_STRING_I: |
3382 // TODO: This code assumes that any user string that matches will b
e no longer | 3404 // TODO: This code assumes that any user string that matches will b
e no longer |
3383 // than our compiled string, with case insensitive matching. | 3405 // than our compiled string, with case insensitive matching. |
3384 // Our compiled string has been case-folded already. | 3406 // Our compiled string has been case-folded already. |
3385 // | 3407 // |
3386 // Any matching user string will have no more code points tha
n our | 3408 // Any matching user string will have no more code points tha
n our |
3387 // compiled (folded) string. Folding may add code points, bu
t | 3409 // compiled (folded) string. Folding may add code points, bu
t |
3388 // not remove them. | 3410 // not remove them. |
3389 // | 3411 // |
3390 // There is a potential problem if a supplemental code point | 3412 // There is a potential problem if a supplemental code point |
3391 // case-folds to a BMP code point. In this case our compiled
string | 3413 // case-folds to a BMP code point. In this case our compiled
string |
3392 // could be shorter (in code units) than a matching user stri
ng. | 3414 // could be shorter (in code units) than a matching user stri
ng. |
3393 // | 3415 // |
3394 // At this time (Unicode 6.1) there are no such characters, a
nd this case | 3416 // At this time (Unicode 6.1) there are no such characters, a
nd this case |
3395 // is not being handled. A test, intltest regex/Bug9283, wil
l fail if | 3417 // is not being handled. A test, intltest regex/Bug9283, wil
l fail if |
3396 // any problematic characters are added to Unicode. | 3418 // any problematic characters are added to Unicode. |
3397 // | 3419 // |
3398 // If this happens, we can make a set of the BMP chars that t
he | 3420 // If this happens, we can make a set of the BMP chars that t
he |
3399 // troublesome supplementals fold to, scan our string, and bu
mp the | 3421 // troublesome supplementals fold to, scan our string, and bu
mp the |
3400 // currentLen one extra for each that is found. | 3422 // currentLen one extra for each that is found. |
(...skipping 10 matching lines...) Expand all Loading... |
3411 // For Loops, recursively call this function on the pattern for the
loop body, | 3433 // For Loops, recursively call this function on the pattern for the
loop body, |
3412 // then multiply the result by the maximum loop count. | 3434 // then multiply the result by the maximum loop count. |
3413 { | 3435 { |
3414 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(l
oc+1)); | 3436 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(l
oc+1)); |
3415 if (loopEndLoc == loc+4) { | 3437 if (loopEndLoc == loc+4) { |
3416 // Loop has an empty body. No effect on max match length. | 3438 // Loop has an empty body. No effect on max match length. |
3417 // Continue processing with code after the loop end. | 3439 // Continue processing with code after the loop end. |
3418 loc = loopEndLoc; | 3440 loc = loopEndLoc; |
3419 break; | 3441 break; |
3420 } | 3442 } |
3421 | 3443 |
3422 int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3); | 3444 int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3); |
3423 if (maxLoopCount == -1) { | 3445 if (maxLoopCount == -1) { |
3424 // Unbounded Loop. No upper bound on match length. | 3446 // Unbounded Loop. No upper bound on match length. |
3425 currentLen = INT32_MAX; | 3447 currentLen = INT32_MAX; |
3426 break; | 3448 break; |
3427 } | 3449 } |
3428 | 3450 |
3429 U_ASSERT(loopEndLoc >= loc+4); | 3451 U_ASSERT(loopEndLoc >= loc+4); |
3430 int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Rec
ursive call. | 3452 int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Rec
ursive call. |
3431 if (blockLen == INT32_MAX) { | 3453 if (blockLen == INT32_MAX) { |
(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3529 // will be offset at each location in the original code. | 3551 // will be offset at each location in the original code. |
3530 int32_t loc; | 3552 int32_t loc; |
3531 int32_t d = 0; | 3553 int32_t d = 0; |
3532 for (loc=0; loc<end; loc++) { | 3554 for (loc=0; loc<end; loc++) { |
3533 deltas.addElement(d, *fStatus); | 3555 deltas.addElement(d, *fStatus); |
3534 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); | 3556 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); |
3535 if (URX_TYPE(op) == URX_NOP) { | 3557 if (URX_TYPE(op) == URX_NOP) { |
3536 d++; | 3558 d++; |
3537 } | 3559 } |
3538 } | 3560 } |
3539 | 3561 |
3540 UnicodeString caseStringBuffer; | 3562 UnicodeString caseStringBuffer; |
3541 | 3563 |
3542 // Make a second pass over the code, removing the NOPs by moving following | 3564 // Make a second pass over the code, removing the NOPs by moving following |
3543 // code up, and patching operands that refer to code locations that | 3565 // code up, and patching operands that refer to code locations that |
3544 // are being moved. The array of offsets from the first step is used | 3566 // are being moved. The array of offsets from the first step is used |
3545 // to compute the new operand values. | 3567 // to compute the new operand values. |
3546 int32_t src; | 3568 int32_t src; |
3547 int32_t dst = 0; | 3569 int32_t dst = 0; |
3548 for (src=0; src<end; src++) { | 3570 for (src=0; src<end; src++) { |
3549 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); | 3571 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); |
(...skipping 26 matching lines...) Expand all Loading... |
3576 { | 3598 { |
3577 int32_t where = URX_VAL(op); | 3599 int32_t where = URX_VAL(op); |
3578 if (where > fRXPat->fGroupMap->size()) { | 3600 if (where > fRXPat->fGroupMap->size()) { |
3579 error(U_REGEX_INVALID_BACK_REF); | 3601 error(U_REGEX_INVALID_BACK_REF); |
3580 break; | 3602 break; |
3581 } | 3603 } |
3582 where = fRXPat->fGroupMap->elementAti(where-1); | 3604 where = fRXPat->fGroupMap->elementAti(where-1); |
3583 op = URX_BUILD(opType, where); | 3605 op = URX_BUILD(opType, where); |
3584 fRXPat->fCompiledPat->setElementAt(op, dst); | 3606 fRXPat->fCompiledPat->setElementAt(op, dst); |
3585 dst++; | 3607 dst++; |
3586 | 3608 |
3587 fRXPat->fNeedsAltInput = TRUE; | 3609 fRXPat->fNeedsAltInput = TRUE; |
3588 break; | 3610 break; |
3589 } | 3611 } |
3590 case URX_RESERVED_OP: | 3612 case URX_RESERVED_OP: |
3591 case URX_RESERVED_OP_N: | 3613 case URX_RESERVED_OP_N: |
3592 case URX_BACKTRACK: | 3614 case URX_BACKTRACK: |
3593 case URX_END: | 3615 case URX_END: |
3594 case URX_ONECHAR: | 3616 case URX_ONECHAR: |
3595 case URX_STRING: | 3617 case URX_STRING: |
3596 case URX_STRING_LEN: | 3618 case URX_STRING_LEN: |
(...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3667 if (fLineNum > 0x7FFFFFFF) { | 3689 if (fLineNum > 0x7FFFFFFF) { |
3668 fParseErr->line = 0; | 3690 fParseErr->line = 0; |
3669 fParseErr->offset = -1; | 3691 fParseErr->offset = -1; |
3670 } else if (fCharNum > 0x7FFFFFFF) { | 3692 } else if (fCharNum > 0x7FFFFFFF) { |
3671 fParseErr->line = (int32_t)fLineNum; | 3693 fParseErr->line = (int32_t)fLineNum; |
3672 fParseErr->offset = -1; | 3694 fParseErr->offset = -1; |
3673 } else { | 3695 } else { |
3674 fParseErr->line = (int32_t)fLineNum; | 3696 fParseErr->line = (int32_t)fLineNum; |
3675 fParseErr->offset = (int32_t)fCharNum; | 3697 fParseErr->offset = (int32_t)fCharNum; |
3676 } | 3698 } |
3677 | 3699 |
3678 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting con
text | 3700 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting con
text |
3679 | 3701 |
3680 // Fill in the context. | 3702 // Fill in the context. |
3681 // Note: extractBetween() pins supplied indices to the string bounds. | 3703 // Note: extractBetween() pins supplied indices to the string bounds. |
3682 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); | 3704 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); |
3683 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); | 3705 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); |
3684 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanI
ndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); | 3706 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanI
ndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); |
3685 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_L
EN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); | 3707 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_L
EN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); |
3686 } | 3708 } |
3687 } | 3709 } |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3721 // | 3743 // |
3722 //------------------------------------------------------------------------------ | 3744 //------------------------------------------------------------------------------ |
3723 UChar32 RegexCompile::nextCharLL() { | 3745 UChar32 RegexCompile::nextCharLL() { |
3724 UChar32 ch; | 3746 UChar32 ch; |
3725 | 3747 |
3726 if (fPeekChar != -1) { | 3748 if (fPeekChar != -1) { |
3727 ch = fPeekChar; | 3749 ch = fPeekChar; |
3728 fPeekChar = -1; | 3750 fPeekChar = -1; |
3729 return ch; | 3751 return ch; |
3730 } | 3752 } |
3731 | 3753 |
3732 // assume we're already in the right place | 3754 // assume we're already in the right place |
3733 ch = UTEXT_NEXT32(fRXPat->fPattern); | 3755 ch = UTEXT_NEXT32(fRXPat->fPattern); |
3734 if (ch == U_SENTINEL) { | 3756 if (ch == U_SENTINEL) { |
3735 return ch; | 3757 return ch; |
3736 } | 3758 } |
3737 | 3759 |
3738 if (ch == chCR || | 3760 if (ch == chCR || |
3739 ch == chNEL || | 3761 ch == chNEL || |
3740 ch == chLS || | 3762 ch == chLS || |
3741 (ch == chLF && fLastChar != chCR)) { | 3763 (ch == chLF && fLastChar != chCR)) { |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3777 // | 3799 // |
3778 //------------------------------------------------------------------------------ | 3800 //------------------------------------------------------------------------------ |
3779 void RegexCompile::nextChar(RegexPatternChar &c) { | 3801 void RegexCompile::nextChar(RegexPatternChar &c) { |
3780 | 3802 |
3781 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); | 3803 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); |
3782 c.fChar = nextCharLL(); | 3804 c.fChar = nextCharLL(); |
3783 c.fQuoted = FALSE; | 3805 c.fQuoted = FALSE; |
3784 | 3806 |
3785 if (fQuoteMode) { | 3807 if (fQuoteMode) { |
3786 c.fQuoted = TRUE; | 3808 c.fQuoted = TRUE; |
3787 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_
LITERAL) == 0)) || | 3809 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_
LITERAL) == 0)) || |
3788 c.fChar == (UChar32)-1) { | 3810 c.fChar == (UChar32)-1) { |
3789 fQuoteMode = FALSE; // Exit quote mode, | 3811 fQuoteMode = FALSE; // Exit quote mode, |
3790 nextCharLL(); // discard the E | 3812 nextCharLL(); // discard the E |
3791 nextChar(c); // recurse to get the real next char | 3813 nextChar(c); // recurse to get the real next char |
3792 } | 3814 } |
3793 } | 3815 } |
3794 else if (fInBackslashQuote) { | 3816 else if (fInBackslashQuote) { |
3795 // The current character immediately follows a '\' | 3817 // The current character immediately follows a '\' |
3796 // Don't check for any further escapes, just return it as-is. | 3818 // Don't check for any further escapes, just return it as-is. |
3797 // Don't set c.fQuoted, because that would prevent the state machine fro
m | 3819 // Don't set c.fQuoted, because that would prevent the state machine fro
m |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3838 if (c.fChar == chBackSlash) { | 3860 if (c.fChar == chBackSlash) { |
3839 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); | 3861 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); |
3840 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekChar
LL())) { | 3862 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekChar
LL())) { |
3841 // | 3863 // |
3842 // A '\' sequence that is handled by ICU's standard unescapeAt f
unction. | 3864 // A '\' sequence that is handled by ICU's standard unescapeAt f
unction. |
3843 // Includes \uxxxx, \n, \r, many others. | 3865 // Includes \uxxxx, \n, \r, many others. |
3844 // Return the single equivalent character. | 3866 // Return the single equivalent character. |
3845 // | 3867 // |
3846 nextCharLL(); // get & discard the peeked char. | 3868 nextCharLL(); // get & discard the peeked char. |
3847 c.fQuoted = TRUE; | 3869 c.fQuoted = TRUE; |
3848 | 3870 |
3849 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength))
{ | 3871 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength))
{ |
3850 int32_t endIndex = (int32_t)pos; | 3872 int32_t endIndex = (int32_t)pos; |
3851 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endInd
ex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); | 3873 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endInd
ex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); |
3852 | 3874 |
3853 if (endIndex == pos) { | 3875 if (endIndex == pos) { |
3854 error(U_REGEX_BAD_ESCAPE_SEQUENCE); | 3876 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
3855 } | 3877 } |
3856 fCharNum += endIndex - pos; | 3878 fCharNum += endIndex - pos; |
3857 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); | 3879 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); |
3858 } else { | 3880 } else { |
3859 int32_t offset = 0; | 3881 int32_t offset = 0; |
3860 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEX
T_UNESCAPE_CONTEXT(fRXPat->fPattern); | 3882 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEX
T_UNESCAPE_CONTEXT(fRXPat->fPattern); |
3861 | 3883 |
3862 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); | 3884 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); |
3863 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset
, INT32_MAX, &context); | 3885 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset
, INT32_MAX, &context); |
3864 | 3886 |
3865 if (offset == 0) { | 3887 if (offset == 0) { |
3866 error(U_REGEX_BAD_ESCAPE_SEQUENCE); | 3888 error(U_REGEX_BAD_ESCAPE_SEQUENCE); |
3867 } else if (context.lastOffset == offset) { | 3889 } else if (context.lastOffset == offset) { |
3868 UTEXT_PREVIOUS32(fRXPat->fPattern); | 3890 UTEXT_PREVIOUS32(fRXPat->fPattern); |
3869 } else if (context.lastOffset != offset-1) { | 3891 } else if (context.lastOffset != offset-1) { |
3870 utext_moveIndex32(fRXPat->fPattern, offset - context.las
tOffset - 1); | 3892 utext_moveIndex32(fRXPat->fPattern, offset - context.las
tOffset - 1); |
3871 } | 3893 } |
(...skipping 22 matching lines...) Expand all Loading... |
3894 } | 3916 } |
3895 c.fChar <<= 3; | 3917 c.fChar <<= 3; |
3896 c.fChar += ch&7; | 3918 c.fChar += ch&7; |
3897 if (c.fChar <= 255) { | 3919 if (c.fChar <= 255) { |
3898 nextCharLL(); | 3920 nextCharLL(); |
3899 } else { | 3921 } else { |
3900 // The last digit made the number too big. Forget we sa
w it. | 3922 // The last digit made the number too big. Forget we sa
w it. |
3901 c.fChar >>= 3; | 3923 c.fChar >>= 3; |
3902 } | 3924 } |
3903 } | 3925 } |
3904 c.fQuoted = TRUE; | 3926 c.fQuoted = TRUE; |
3905 } | 3927 } |
3906 else if (peekCharLL() == chQ) { | 3928 else if (peekCharLL() == chQ) { |
3907 // "\Q" enter quote mode, which will continue until "\E" | 3929 // "\Q" enter quote mode, which will continue until "\E" |
3908 fQuoteMode = TRUE; | 3930 fQuoteMode = TRUE; |
3909 nextCharLL(); // discard the 'Q'. | 3931 nextCharLL(); // discard the 'Q'. |
3910 nextChar(c); // recurse to get the real next char. | 3932 nextChar(c); // recurse to get the real next char. |
3911 } | 3933 } |
3912 else | 3934 else |
3913 { | 3935 { |
3914 // We are in a '\' escape that will be handled by the state tabl
e scanner. | 3936 // We are in a '\' escape that will be handled by the state tabl
e scanner. |
3915 // Just return the backslash, but remember that the following ch
ar is to | 3937 // Just return the backslash, but remember that the following ch
ar is to |
(...skipping 27 matching lines...) Expand all Loading... |
3943 UChar32 RegexCompile::scanNamedChar() { | 3965 UChar32 RegexCompile::scanNamedChar() { |
3944 if (U_FAILURE(*fStatus)) { | 3966 if (U_FAILURE(*fStatus)) { |
3945 return 0; | 3967 return 0; |
3946 } | 3968 } |
3947 | 3969 |
3948 nextChar(fC); | 3970 nextChar(fC); |
3949 if (fC.fChar != chLBrace) { | 3971 if (fC.fChar != chLBrace) { |
3950 error(U_REGEX_PROPERTY_SYNTAX); | 3972 error(U_REGEX_PROPERTY_SYNTAX); |
3951 return 0; | 3973 return 0; |
3952 } | 3974 } |
3953 | 3975 |
3954 UnicodeString charName; | 3976 UnicodeString charName; |
3955 for (;;) { | 3977 for (;;) { |
3956 nextChar(fC); | 3978 nextChar(fC); |
3957 if (fC.fChar == chRBrace) { | 3979 if (fC.fChar == chRBrace) { |
3958 break; | 3980 break; |
3959 } | 3981 } |
3960 if (fC.fChar == -1) { | 3982 if (fC.fChar == -1) { |
3961 error(U_REGEX_PROPERTY_SYNTAX); | 3983 error(U_REGEX_PROPERTY_SYNTAX); |
3962 return 0; | 3984 return 0; |
3963 } | 3985 } |
3964 charName.append(fC.fChar); | 3986 charName.append(fC.fChar); |
3965 } | 3987 } |
3966 | 3988 |
3967 char name[100]; | 3989 char name[100]; |
3968 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || | 3990 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || |
3969 (uint32_t)charName.length()>=sizeof(name)) { | 3991 (uint32_t)charName.length()>=sizeof(name)) { |
3970 // All Unicode character names have only invariant characters. | 3992 // All Unicode character names have only invariant characters. |
3971 // The API to get a character, given a name, accepts only char *, forcin
g us to convert, | 3993 // The API to get a character, given a name, accepts only char *, forcin
g us to convert, |
3972 // which requires this error check | 3994 // which requires this error check |
3973 error(U_REGEX_PROPERTY_SYNTAX); | 3995 error(U_REGEX_PROPERTY_SYNTAX); |
3974 return 0; | 3996 return 0; |
3975 } | 3997 } |
3976 charName.extract(0, charName.length(), name, sizeof(name), US_INV); | 3998 charName.extract(0, charName.length(), name, sizeof(name), US_INV); |
(...skipping 18 matching lines...) Expand all Loading... |
3995 // Return a UnicodeSet, constructed from the \P pattern, | 4017 // Return a UnicodeSet, constructed from the \P pattern, |
3996 // or NULL if the pattern is invalid. | 4018 // or NULL if the pattern is invalid. |
3997 // | 4019 // |
3998 //------------------------------------------------------------------------------ | 4020 //------------------------------------------------------------------------------ |
3999 UnicodeSet *RegexCompile::scanProp() { | 4021 UnicodeSet *RegexCompile::scanProp() { |
4000 UnicodeSet *uset = NULL; | 4022 UnicodeSet *uset = NULL; |
4001 | 4023 |
4002 if (U_FAILURE(*fStatus)) { | 4024 if (U_FAILURE(*fStatus)) { |
4003 return NULL; | 4025 return NULL; |
4004 } | 4026 } |
| 4027 (void)chLowerP; // Suppress compiler unused variable warning. |
4005 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); | 4028 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); |
4006 UBool negated = (fC.fChar == chP); | 4029 UBool negated = (fC.fChar == chP); |
4007 | 4030 |
4008 UnicodeString propertyName; | 4031 UnicodeString propertyName; |
4009 nextChar(fC); | 4032 nextChar(fC); |
4010 if (fC.fChar != chLBrace) { | 4033 if (fC.fChar != chLBrace) { |
4011 error(U_REGEX_PROPERTY_SYNTAX); | 4034 error(U_REGEX_PROPERTY_SYNTAX); |
4012 return NULL; | 4035 return NULL; |
4013 } | 4036 } |
4014 for (;;) { | 4037 for (;;) { |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4064 UBool savedInBackslashQuote = fInBackslashQuote; | 4087 UBool savedInBackslashQuote = fInBackslashQuote; |
4065 UBool savedEOLComments = fEOLComments; | 4088 UBool savedEOLComments = fEOLComments; |
4066 int64_t savedLineNum = fLineNum; | 4089 int64_t savedLineNum = fLineNum; |
4067 int64_t savedCharNum = fCharNum; | 4090 int64_t savedCharNum = fCharNum; |
4068 UChar32 savedLastChar = fLastChar; | 4091 UChar32 savedLastChar = fLastChar; |
4069 UChar32 savedPeekChar = fPeekChar; | 4092 UChar32 savedPeekChar = fPeekChar; |
4070 RegexPatternChar savedfC = fC; | 4093 RegexPatternChar savedfC = fC; |
4071 | 4094 |
4072 // Scan for a closing ]. A little tricky because there are some perverse | 4095 // Scan for a closing ]. A little tricky because there are some perverse |
4073 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expre
ssion, | 4096 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expre
ssion, |
4074 // ending on the second closing ]. | 4097 // ending on the second closing ]. |
4075 | 4098 |
4076 UnicodeString propName; | 4099 UnicodeString propName; |
4077 UBool negated = FALSE; | 4100 UBool negated = FALSE; |
4078 | 4101 |
4079 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Lette
r:] | 4102 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Lette
r:] |
4080 nextChar(fC); | 4103 nextChar(fC); |
4081 if (fC.fChar == chUp) { | 4104 if (fC.fChar == chUp) { |
4082 negated = TRUE; | 4105 negated = TRUE; |
4083 nextChar(fC); | 4106 nextChar(fC); |
4084 } | 4107 } |
4085 | 4108 |
4086 // Scan for the closing ":]", collecting the property name along the way. | 4109 // Scan for the closing ":]", collecting the property name along the way. |
4087 UBool sawPropSetTerminator = FALSE; | 4110 UBool sawPropSetTerminator = FALSE; |
4088 for (;;) { | 4111 for (;;) { |
4089 propName.append(fC.fChar); | 4112 propName.append(fC.fChar); |
4090 nextChar(fC); | 4113 nextChar(fC); |
4091 if (fC.fQuoted || fC.fChar == -1) { | 4114 if (fC.fQuoted || fC.fChar == -1) { |
4092 // Escaped characters or end of input - either says this isn't a [:P
roperty:] | 4115 // Escaped characters or end of input - either says this isn't a [:P
roperty:] |
4093 break; | 4116 break; |
4094 } | 4117 } |
4095 if (fC.fChar == chColon) { | 4118 if (fC.fChar == chColon) { |
4096 nextChar(fC); | 4119 nextChar(fC); |
4097 if (fC.fChar == chRBracket) { | 4120 if (fC.fChar == chRBracket) { |
4098 sawPropSetTerminator = TRUE; | 4121 sawPropSetTerminator = TRUE; |
4099 } | 4122 } |
4100 break; | 4123 break; |
4101 } | 4124 } |
4102 } | 4125 } |
4103 | 4126 |
4104 if (sawPropSetTerminator) { | 4127 if (sawPropSetTerminator) { |
4105 uset = createSetForProperty(propName, negated); | 4128 uset = createSetForProperty(propName, negated); |
4106 } | 4129 } |
4107 else | 4130 else |
4108 { | 4131 { |
4109 // No closing ":]". | 4132 // No closing ":]". |
4110 // Restore the original scan position. | 4133 // Restore the original scan position. |
4111 // The main scanner will retry the input as a normal set expression, | 4134 // The main scanner will retry the input as a normal set expression, |
4112 // not a [:Property:] expression. | 4135 // not a [:Property:] expression. |
4113 fScanIndex = savedScanIndex; | 4136 fScanIndex = savedScanIndex; |
(...skipping 12 matching lines...) Expand all Loading... |
4126 | 4149 |
4127 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { | 4150 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { |
4128 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); | 4151 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); |
4129 addCategory(set, U_GC_CF_MASK, ec); | 4152 addCategory(set, U_GC_CF_MASK, ec); |
4130 } | 4153 } |
4131 | 4154 |
4132 // | 4155 // |
4133 // Create a Unicode Set from a Unicode Property expression. | 4156 // Create a Unicode Set from a Unicode Property expression. |
4134 // This is common code underlying both \p{...} and [:...:] expressions. | 4157 // This is common code underlying both \p{...} and [:...:] expressions. |
4135 // Includes trying the Java "properties" that aren't supported as | 4158 // Includes trying the Java "properties" that aren't supported as |
4136 // normal ICU UnicodeSet properties | 4159 // normal ICU UnicodeSet properties |
4137 // | 4160 // |
4138 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" | 4161 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" |
4139 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" | 4162 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" |
4140 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
ool negated) { | 4163 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
ool negated) { |
4141 UnicodeString setExpr; | 4164 UnicodeString setExpr; |
4142 UnicodeSet *set; | 4165 UnicodeSet *set; |
4143 uint32_t usetFlags = 0; | 4166 uint32_t usetFlags = 0; |
4144 | 4167 |
4145 if (U_FAILURE(*fStatus)) { | 4168 if (U_FAILURE(*fStatus)) { |
4146 return NULL; | 4169 return NULL; |
4147 } | 4170 } |
4148 | 4171 |
4149 // | 4172 // |
4150 // First try the property as we received it | 4173 // First try the property as we received it |
4151 // | 4174 // |
4152 if (negated) { | 4175 if (negated) { |
4153 setExpr.append(negSetPrefix, -1); | 4176 setExpr.append(negSetPrefix, -1); |
4154 } else { | 4177 } else { |
4155 setExpr.append(posSetPrefix, -1); | 4178 setExpr.append(posSetPrefix, -1); |
4156 } | 4179 } |
4157 setExpr.append(propName); | 4180 setExpr.append(propName); |
4158 setExpr.append(chRBrace); | 4181 setExpr.append(chRBrace); |
4159 setExpr.append(chRBracket); | 4182 setExpr.append(chRBracket); |
4160 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | 4183 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
4161 usetFlags |= USET_CASE_INSENSITIVE; | 4184 usetFlags |= USET_CASE_INSENSITIVE; |
4162 } | 4185 } |
4163 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); | 4186 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); |
4164 if (U_SUCCESS(*fStatus)) { | 4187 if (U_SUCCESS(*fStatus)) { |
4165 return set; | 4188 return set; |
4166 } | 4189 } |
4167 delete set; | 4190 delete set; |
4168 set = NULL; | 4191 set = NULL; |
4169 | 4192 |
4170 // | 4193 // |
4171 // The property as it was didn't work. | 4194 // The property as it was didn't work. |
4172 | 4195 |
4173 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" n
ot standard POSIX | 4196 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" n
ot standard POSIX |
4174 // or standard Java, but many other regular expression packages do recog
nize it. | 4197 // or standard Java, but many other regular expression packages do recog
nize it. |
4175 | 4198 |
4176 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) { | 4199 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) { |
4177 *fStatus = U_ZERO_ERROR; | 4200 *fStatus = U_ZERO_ERROR; |
4178 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])); | 4201 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])); |
4179 if (set == NULL) { | 4202 if (set == NULL) { |
4180 *fStatus = U_MEMORY_ALLOCATION_ERROR; | 4203 *fStatus = U_MEMORY_ALLOCATION_ERROR; |
4181 return set; | 4204 return set; |
4182 } | 4205 } |
4183 if (negated) { | 4206 if (negated) { |
4184 set->complement(); | 4207 set->complement(); |
4185 } | 4208 } |
4186 return set; | 4209 return set; |
4187 } | 4210 } |
4188 | 4211 |
4189 | 4212 |
4190 // Do Java fixes - | 4213 // Do Java fixes - |
4191 // InGreek -> InGreek or Coptic, that being the official Unicode name
for that block. | 4214 // InGreek -> InGreek or Coptic, that being the official Unicode name
for that block. |
4192 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols
. | 4215 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols
. |
4193 // | 4216 // |
4194 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombinin
g Marks for Symbols" | 4217 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombinin
g Marks for Symbols" |
4195 // is accepted by Java. The property part of the nam
e is compared | 4218 // is accepted by Java. The property part of the nam
e is compared |
4196 // case-insensitively. The spaces must be exactly as
shown, either | 4219 // case-insensitively. The spaces must be exactly as
shown, either |
4197 // all there, or all omitted, with exactly one at eac
h position | 4220 // all there, or all omitted, with exactly one at eac
h position |
4198 // if they are present. From checking against JDK 1.
6 | 4221 // if they are present. From checking against JDK 1.
6 |
4199 // | 4222 // |
4200 // This code should be removed when ICU properties support the Java c
ompatibility names | 4223 // This code should be removed when ICU properties support the Java c
ompatibility names |
4201 // (ICU 4.0?) | 4224 // (ICU 4.0?) |
4202 // | 4225 // |
4203 UnicodeString mPropName = propName; | 4226 UnicodeString mPropName = propName; |
4204 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { | 4227 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { |
4205 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); | 4228 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); |
4206 } | 4229 } |
4207 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbo
ls"), 0) == 0 || | 4230 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbo
ls"), 0) == 0 || |
4208 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"
), 0) == 0) { | 4231 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"
), 0) == 0) { |
4209 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Sym
bols"); | 4232 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Sym
bols"); |
4210 } | 4233 } |
4211 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { | 4234 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { |
4212 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); | 4235 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); |
4213 } | 4236 } |
4214 | 4237 |
4215 // See if the property looks like a Java "InBlockName", which | 4238 // See if the property looks like a Java "InBlockName", which |
4216 // we will recast as "Block=BlockName" | 4239 // we will recast as "Block=BlockName" |
4217 // | 4240 // |
4218 static const UChar IN[] = {0x49, 0x6E, 0}; // "In" | 4241 static const UChar IN[] = {0x49, 0x6E, 0}; // "In" |
4219 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "
Block=" | 4242 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "
Block=" |
4220 if (mPropName.startsWith(IN, 2) && propName.length()>=3) { | 4243 if (mPropName.startsWith(IN, 2) && propName.length()>=3) { |
4221 setExpr.truncate(4); // Leaves "[\p{", or "[\P{" | 4244 setExpr.truncate(4); // Leaves "[\p{", or "[\P{" |
4222 setExpr.append(BLOCK, -1); | 4245 setExpr.append(BLOCK, -1); |
4223 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leadi
ng "In" removed. | 4246 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leadi
ng "In" removed. |
4224 setExpr.append(chRBrace); | 4247 setExpr.append(chRBrace); |
(...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4328 } | 4351 } |
4329 if (negated) { | 4352 if (negated) { |
4330 set->complement(); | 4353 set->complement(); |
4331 } | 4354 } |
4332 return set; | 4355 return set; |
4333 } | 4356 } |
4334 delete set; | 4357 delete set; |
4335 set = NULL; | 4358 set = NULL; |
4336 } | 4359 } |
4337 error(*fStatus); | 4360 error(*fStatus); |
4338 return NULL; | 4361 return NULL; |
4339 } | 4362 } |
4340 | 4363 |
4341 | 4364 |
4342 | 4365 |
4343 // | 4366 // |
4344 // SetEval Part of the evaluation of [set expressions]. | 4367 // SetEval Part of the evaluation of [set expressions]. |
4345 // Perform any pending (stacked) operations with precedence | 4368 // Perform any pending (stacked) operations with precedence |
4346 // equal or greater to that of the next operator encountered | 4369 // equal or greater to that of the next operator encountered |
4347 // in the expression. | 4370 // in the expression. |
4348 // | 4371 // |
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4396 | 4419 |
4397 void RegexCompile::setPushOp(int32_t op) { | 4420 void RegexCompile::setPushOp(int32_t op) { |
4398 setEval(op); | 4421 setEval(op); |
4399 fSetOpStack.push(op, *fStatus); | 4422 fSetOpStack.push(op, *fStatus); |
4400 fSetStack.push(new UnicodeSet(), *fStatus); | 4423 fSetStack.push(new UnicodeSet(), *fStatus); |
4401 } | 4424 } |
4402 | 4425 |
4403 U_NAMESPACE_END | 4426 U_NAMESPACE_END |
4404 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | 4427 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
4405 | 4428 |
OLD | NEW |