Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(236)

Side by Side Diff: source/i18n/regexcmp.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: remove unusued directories Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/regexcmp.h ('k') | source/i18n/regeximp.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // 1 //
2 // file: regexcmp.cpp 2 // file: regexcmp.cpp
3 // 3 //
4 // Copyright (C) 2002-2013 International Business Machines Corporation and others. 4 // Copyright (C) 2002-2014 International Business Machines Corporation and others.
5 // All Rights Reserved. 5 // All Rights Reserved.
6 // 6 //
7 // This file contains the ICU regular expression compiler, which is responsible 7 // This file contains the ICU regular expression compiler, which is responsible
8 // for processing a regular expression pattern into the compiled form that 8 // for processing a regular expression pattern into the compiled form that
9 // is used by the match finding engine. 9 // is used by the match finding engine.
10 // 10 //
11 11
12 #include "unicode/utypes.h" 12 #include "unicode/utypes.h"
13 13
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15 15
16 #include "unicode/ustring.h" 16 #include "unicode/ustring.h"
17 #include "unicode/unistr.h" 17 #include "unicode/unistr.h"
18 #include "unicode/uniset.h" 18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h" 19 #include "unicode/uchar.h"
20 #include "unicode/uchriter.h" 20 #include "unicode/uchriter.h"
21 #include "unicode/parsepos.h" 21 #include "unicode/parsepos.h"
22 #include "unicode/parseerr.h" 22 #include "unicode/parseerr.h"
23 #include "unicode/regex.h" 23 #include "unicode/regex.h"
24 #include "unicode/utf.h" 24 #include "unicode/utf.h"
25 #include "unicode/utf16.h" 25 #include "unicode/utf16.h"
26 #include "patternprops.h" 26 #include "patternprops.h"
27 #include "putilimp.h" 27 #include "putilimp.h"
28 #include "cmemory.h" 28 #include "cmemory.h"
29 #include "cstring.h" 29 #include "cstring.h"
30 #include "uvectr32.h" 30 #include "uvectr32.h"
31 #include "uvectr64.h" 31 #include "uvectr64.h"
32 #include "uassert.h" 32 #include "uassert.h"
33 #include "ucln_in.h"
34 #include "uinvchar.h" 33 #include "uinvchar.h"
35 34
36 #include "regeximp.h" 35 #include "regeximp.h"
37 #include "regexcst.h" // Contains state table for the regex pattern parser. 36 #include "regexcst.h" // Contains state table for the regex pattern parser.
38 // generated by a Perl script. 37 // generated by a Perl script.
39 #include "regexcmp.h" 38 #include "regexcmp.h"
40 #include "regexst.h" 39 #include "regexst.h"
41 #include "regextxt.h" 40 #include "regextxt.h"
42 41
43 42
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after
102 // 101 //
103 //------------------------------------------------------------------------------ 102 //------------------------------------------------------------------------------
104 void RegexCompile::compile( 103 void RegexCompile::compile(
105 const UnicodeString &pat, // Source pat to be compiled. 104 const UnicodeString &pat, // Source pat to be compiled.
106 UParseError &pp, // Error position info 105 UParseError &pp, // Error position info
107 UErrorCode &e) // Error Code 106 UErrorCode &e) // Error Code
108 { 107 {
109 fRXPat->fPatternString = new UnicodeString(pat); 108 fRXPat->fPatternString = new UnicodeString(pat);
110 UText patternText = UTEXT_INITIALIZER; 109 UText patternText = UTEXT_INITIALIZER;
111 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); 110 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
112 111
113 if (U_SUCCESS(e)) { 112 if (U_SUCCESS(e)) {
114 compile(&patternText, pp, e); 113 compile(&patternText, pp, e);
115 utext_close(&patternText); 114 utext_close(&patternText);
116 } 115 }
117 } 116 }
118 117
119 // 118 //
120 // compile, UText mode 119 // compile, UText mode
121 // All the work is actually done here. 120 // All the work is actually done here.
122 // 121 //
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after
295 n *= 10; 294 n *= 10;
296 } 295 }
297 296
298 // 297 //
299 // The pattern's fFrameSize so far has accumulated the requirements for 298 // The pattern's fFrameSize so far has accumulated the requirements for
300 // storage for capture parentheses, counters, etc. that are encountered 299 // storage for capture parentheses, counters, etc. that are encountered
301 // in the pattern. Add space for the two variables that are always 300 // in the pattern. Add space for the two variables that are always
302 // present in the saved state: the input string position (int64_t) and 301 // present in the saved state: the input string position (int64_t) and
303 // the position in the compiled pattern. 302 // the position in the compiled pattern.
304 // 303 //
305 allocateStackData(RESTACKFRAME_HDRCOUNT); 304 fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT;
306 305
307 // 306 //
308 // Optimization pass 1: NOPs, back-references, and case-folding 307 // Optimization pass 1: NOPs, back-references, and case-folding
309 // 308 //
310 stripNOPs(); 309 stripNOPs();
311 310
312 // 311 //
313 // Get bounds for the minimum and maximum length of a string that this 312 // Get bounds for the minimum and maximum length of a string that this
314 // pattern can match. Used to avoid looking for matches in strings that 313 // pattern can match. Used to avoid looking for matches in strings that
315 // are too short. 314 // are too short.
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
361 360
362 case doPatStart: 361 case doPatStart:
363 // Start of pattern compiles to: 362 // Start of pattern compiles to:
364 //0 SAVE 2 Fall back to position of FAIL 363 //0 SAVE 2 Fall back to position of FAIL
365 //1 jmp 3 364 //1 jmp 3
366 //2 FAIL Stop if we ever reach here. 365 //2 FAIL Stop if we ever reach here.
367 //3 NOP Dummy, so start of pattern looks the same as 366 //3 NOP Dummy, so start of pattern looks the same as
368 // the start of an ( grouping. 367 // the start of an ( grouping.
369 //4 NOP Resreved, will be replaced by a save if there are 368 //4 NOP Resreved, will be replaced by a save if there are
370 // OR | operators at the top level 369 // OR | operators at the top level
371 appendOp(URX_BUILD(URX_STATE_SAVE, 2)); 370 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus) ;
372 appendOp(URX_BUILD(URX_JMP, 3)); 371 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus);
373 appendOp(URX_BUILD(URX_FAIL, 0)); 372 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
374 373
375 // Standard open nonCapture paren action emits the two NOPs and 374 // Standard open nonCapture paren action emits the two NOPs and
376 // sets up the paren stack frame. 375 // sets up the paren stack frame.
377 doParseActions(doOpenNonCaptureParen); 376 doParseActions(doOpenNonCaptureParen);
378 break; 377 break;
379 378
380 case doPatFinish: 379 case doPatFinish:
381 // We've scanned to the end of the pattern 380 // We've scanned to the end of the pattern
382 // The end of pattern compiles to: 381 // The end of pattern compiles to:
383 // URX_END 382 // URX_END
384 // which will stop the runtime match engine. 383 // which will stop the runtime match engine.
385 // Encountering end of pattern also behaves like a close paren, 384 // Encountering end of pattern also behaves like a close paren,
386 // and forces fixups of the State Save at the beginning of the compiled pattern 385 // and forces fixups of the State Save at the beginning of the compiled pattern
387 // and of any OR operations at the top level. 386 // and of any OR operations at the top level.
388 // 387 //
389 handleCloseParen(); 388 handleCloseParen();
390 if (fParenStack.size() > 0) { 389 if (fParenStack.size() > 0) {
391 // Missing close paren in pattern. 390 // Missing close paren in pattern.
392 error(U_REGEX_MISMATCHED_PAREN); 391 error(U_REGEX_MISMATCHED_PAREN);
393 } 392 }
394 393
395 // add the END operation to the compiled pattern. 394 // add the END operation to the compiled pattern.
396 appendOp(URX_BUILD(URX_END, 0)); 395 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
397 396
398 // Terminate the pattern compilation state machine. 397 // Terminate the pattern compilation state machine.
399 returnVal = FALSE; 398 returnVal = FALSE;
400 break; 399 break;
401 400
402 401
403 402
404 case doOrOperator: 403 case doOrOperator:
405 // Scanning a '|', as in (A|B) 404 // Scanning a '|', as in (A|B)
406 { 405 {
407 // Generate code for any pending literals preceding the '|' 406 // Generate code for any pending literals preceding the '|'
408 fixLiterals(FALSE); 407 fixLiterals(FALSE);
409 408
410 // Insert a SAVE operation at the start of the pattern section preceding 409 // Insert a SAVE operation at the start of the pattern section preceding
411 // this OR at this level. This SAVE will branch the match forward 410 // this OR at this level. This SAVE will branch the match forward
412 // to the right hand side of the OR in the event that the left hand 411 // to the right hand side of the OR in the event that the left hand
413 // side fails to match and backtracks. Locate the position for the 412 // side fails to match and backtracks. Locate the position for the
414 // save from the location on the top of the parentheses stack. 413 // save from the location on the top of the parentheses stack.
415 int32_t savePosition = fParenStack.popi(); 414 int32_t savePosition = fParenStack.popi();
416 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition) ; 415 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition) ;
417 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location 416 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
418 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); 417 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
419 fRXPat->fCompiledPat->setElementAt(op, savePosition); 418 fRXPat->fCompiledPat->setElementAt(op, savePosition);
420 419
421 // Append an JMP operation into the compiled pattern. The operand for 420 // Append an JMP operation into the compiled pattern. The operand for
422 // the JMP will eventually be the location following the ')' for the 421 // the JMP will eventually be the location following the ')' for the
423 // group. This will be patched in later, when the ')' is encountered. 422 // group. This will be patched in later, when the ')' is encountered.
424 op = URX_BUILD(URX_JMP, 0); 423 op = URX_BUILD(URX_JMP, 0);
425 appendOp(op); 424 fRXPat->fCompiledPat->addElement(op, *fStatus);
426 425
427 // Push the position of the newly added JMP op onto the parentheses stack. 426 // Push the position of the newly added JMP op onto the parentheses stack.
428 // This registers if for fixup when this block's close paren is encountered. 427 // This registers if for fixup when this block's close paren is encountered.
429 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 428 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
430 429
431 // Append a NOP to the compiled pattern. This is the slot reserved 430 // Append a NOP to the compiled pattern. This is the slot reserved
432 // for a SAVE in the event that there is yet another '|' following 431 // for a SAVE in the event that there is yet another '|' following
433 // this one. 432 // this one.
434 appendOp(URX_BUILD(URX_NOP, 0)); 433 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
435 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 434 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
436 } 435 }
437 break; 436 break;
438 437
439 438
440 case doOpenCaptureParen: 439 case doOpenCaptureParen:
441 // Open Paren. 440 // Open Paren.
442 // Compile to a 441 // Compile to a
443 // - NOP, which later may be replaced by a save-state if the 442 // - NOP, which later may be replaced by a save-state if the
444 // parenthesized group gets a * quantifier, followed by 443 // parenthesized group gets a * quantifier, followed by
445 // - START_CAPTURE n where n is stack frame offset to the capture group variables. 444 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
446 // - NOP, which may later be replaced by a save-state if there 445 // - NOP, which may later be replaced by a save-state if there
447 // is an '|' alternation within the parens. 446 // is an '|' alternation within the parens.
448 // 447 //
449 // Each capture group gets three slots in the save stack frame: 448 // Each capture group gets three slots in the save stack frame:
450 // 0: Capture Group start position (in input string being matched.) 449 // 0: Capture Group start position (in input string being matched.)
451 // 1: Capture Group end position. 450 // 1: Capture Group end position.
452 // 2: Start of Match-in-progress. 451 // 2: Start of Match-in-progress.
453 // The first two locations are for a completed capture group, and are 452 // The first two locations are for a completed capture group, and are
454 // referred to by back references and the like. 453 // referred to by back references and the like.
455 // The third location stores the capture start position when an START_CAPTURE is 454 // The third location stores the capture start position when an START_CAPTURE is
456 // encountered. This will be promoted to a completed capture when (and if) the corresponding 455 // encountered. This will be promoted to a completed capture when (and if) the corresponding
457 // END_CAPTURE is encountered. 456 // END_CAPTURE is encountered.
458 { 457 {
459 fixLiterals(); 458 fixLiterals();
460 appendOp(URX_BUILD(URX_NOP, 0)); 459 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
461 int32_t varsLoc = allocateStackData(3); // Reserve three slots i n match stack frame. 460 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame.
462 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); 461 fRXPat->fFrameSize += 3;
463 appendOp(cop); 462 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
464 appendOp(URX_BUILD(URX_NOP, 0)); 463 fRXPat->fCompiledPat->addElement(cop, *fStatus);
464 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
465 465
466 // On the Parentheses stack, start a new frame and add the postions 466 // On the Parentheses stack, start a new frame and add the postions
467 // of the two NOPs. Depending on what follows in the pattern, the 467 // of the two NOPs. Depending on what follows in the pattern, the
468 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 468 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
469 // address of the end of the parenthesized group. 469 // address of the end of the parenthesized group.
470 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state 470 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state
471 fParenStack.push(capturing, *fStatus); // Fra me type. 471 fParenStack.push(capturing, *fStatus); // Fra me type.
472 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location 472 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
473 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 473 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
474 474
475 // Save the mapping from group number to stack frame variable positi on. 475 // Save the mapping from group number to stack frame variable positi on.
476 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); 476 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
477 } 477 }
478 break; 478 break;
479 479
480 case doOpenNonCaptureParen: 480 case doOpenNonCaptureParen:
481 // Open non-caputuring (grouping only) Paren. 481 // Open non-caputuring (grouping only) Paren.
482 // Compile to a 482 // Compile to a
483 // - NOP, which later may be replaced by a save-state if the 483 // - NOP, which later may be replaced by a save-state if the
484 // parenthesized group gets a * quantifier, followed by 484 // parenthesized group gets a * quantifier, followed by
485 // - NOP, which may later be replaced by a save-state if there 485 // - NOP, which may later be replaced by a save-state if there
486 // is an '|' alternation within the parens. 486 // is an '|' alternation within the parens.
487 { 487 {
488 fixLiterals(); 488 fixLiterals();
489 appendOp(URX_BUILD(URX_NOP, 0)); 489 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
490 appendOp(URX_BUILD(URX_NOP, 0)); 490 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
491 491
492 // On the Parentheses stack, start a new frame and add the postions 492 // On the Parentheses stack, start a new frame and add the postions
493 // of the two NOPs. 493 // of the two NOPs.
494 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state 494 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state
495 fParenStack.push(plain, *fStatus); // Beg in a new frame. 495 fParenStack.push(plain, *fStatus); // Beg in a new frame.
496 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 496 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
497 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 497 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
498 } 498 }
499 break; 499 break;
500 500
501 501
502 case doOpenAtomicParen: 502 case doOpenAtomicParen:
503 // Open Atomic Paren. (?> 503 // Open Atomic Paren. (?>
504 // Compile to a 504 // Compile to a
505 // - NOP, which later may be replaced if the parenthesized group 505 // - NOP, which later may be replaced if the parenthesized group
506 // has a quantifier, followed by 506 // has a quantifier, followed by
507 // - STO_SP save state stack position, so it can be restored at the ")" 507 // - STO_SP save state stack position, so it can be restored at the ")"
508 // - NOP, which may later be replaced by a save-state if there 508 // - NOP, which may later be replaced by a save-state if there
509 // is an '|' alternation within the parens. 509 // is an '|' alternation within the parens.
510 { 510 {
511 fixLiterals(); 511 fixLiterals();
512 appendOp(URX_BUILD(URX_NOP, 0)); 512 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
513 int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr. 513 int32_t varLoc = fRXPat->fDataSize; // Reserve a data locatio n for saving the
514 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); 514 fRXPat->fDataSize += 1; // state stack ptr.
515 appendOp(stoOp); 515 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc);
516 appendOp(URX_BUILD(URX_NOP, 0)); 516 fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
517 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
517 518
518 // On the Parentheses stack, start a new frame and add the postions 519 // On the Parentheses stack, start a new frame and add the postions
519 // of the two NOPs. Depending on what follows in the pattern, the 520 // of the two NOPs. Depending on what follows in the pattern, the
520 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 521 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
521 // address of the end of the parenthesized group. 522 // address of the end of the parenthesized group.
522 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state 523 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state
523 fParenStack.push(atomic, *fStatus); // Fra me type. 524 fParenStack.push(atomic, *fStatus); // Fra me type.
524 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP 525 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
525 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 526 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
526 } 527 }
(...skipping 22 matching lines...) Expand all
549 // 6. NOP reserved for use by quantifiers on the block. 550 // 6. NOP reserved for use by quantifiers on the block.
550 // Look-ahead can't have quantifiers, but paren stack 551 // Look-ahead can't have quantifiers, but paren stack
551 // compile time conventions require the slot anyhow. 552 // compile time conventions require the slot anyhow.
552 // 7. NOP may be replaced if there is are '|' ops in the block. 553 // 7. NOP may be replaced if there is are '|' ops in the block.
553 // 8. code for parenthesized stuff. 554 // 8. code for parenthesized stuff.
554 // 9. LA_END 555 // 9. LA_END
555 // 556 //
556 // Two data slots are reserved, for saving the stack ptr and the input position. 557 // Two data slots are reserved, for saving the stack ptr and the input position.
557 { 558 {
558 fixLiterals(); 559 fixLiterals();
559 int32_t dataLoc = allocateData(2); 560 int32_t dataLoc = fRXPat->fDataSize;
561 fRXPat->fDataSize += 2;
560 int32_t op = URX_BUILD(URX_LA_START, dataLoc); 562 int32_t op = URX_BUILD(URX_LA_START, dataLoc);
561 appendOp(op); 563 fRXPat->fCompiledPat->addElement(op, *fStatus);
562 564
563 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); 565 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
564 appendOp(op); 566 fRXPat->fCompiledPat->addElement(op, *fStatus);
565 567
566 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); 568 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
567 appendOp(op); 569 fRXPat->fCompiledPat->addElement(op, *fStatus);
568 570
569 op = URX_BUILD(URX_LA_END, dataLoc); 571 op = URX_BUILD(URX_LA_END, dataLoc);
570 appendOp(op); 572 fRXPat->fCompiledPat->addElement(op, *fStatus);
571 573
572 op = URX_BUILD(URX_BACKTRACK, 0); 574 op = URX_BUILD(URX_BACKTRACK, 0);
573 appendOp(op); 575 fRXPat->fCompiledPat->addElement(op, *fStatus);
574 576
575 op = URX_BUILD(URX_NOP, 0); 577 op = URX_BUILD(URX_NOP, 0);
576 appendOp(op); 578 fRXPat->fCompiledPat->addElement(op, *fStatus);
577 appendOp(op); 579 fRXPat->fCompiledPat->addElement(op, *fStatus);
578 580
579 // On the Parentheses stack, start a new frame and add the postions 581 // On the Parentheses stack, start a new frame and add the postions
580 // of the NOPs. 582 // of the NOPs.
581 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state 583 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state
582 fParenStack.push(lookAhead, *fStatus); // Fra me type. 584 fParenStack.push(lookAhead, *fStatus); // Fra me type.
583 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 585 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
584 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 586 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
585 } 587 }
586 break; 588 break;
587 589
588 case doOpenLookAheadNeg: 590 case doOpenLookAheadNeg:
589 // Negated Lookahead. (?! stuff ) 591 // Negated Lookahead. (?! stuff )
590 // Compiles to 592 // Compiles to
591 // 1. START_LA dataloc 593 // 1. START_LA dataloc
592 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, 594 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
593 // // which continues with the match. 595 // // which continues with the match.
594 // 3. NOP // Std. Open Paren sequence, for possible '|' 596 // 3. NOP // Std. Open Paren sequence, for possible '|'
595 // 4. code for parenthesized stuff. 597 // 4. code for parenthesized stuff.
596 // 5. END_LA // Cut back stack, remove saved state from step 2. 598 // 5. END_LA // Cut back stack, remove saved state from step 2.
597 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. 599 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
598 // 7. END_LA // Restore match region, in case look-ahead was using 600 // 7. END_LA // Restore match region, in case look-ahead was using
599 // an alternate (transparent) region. 601 // an alternate (transparent) region.
600 { 602 {
601 fixLiterals(); 603 fixLiterals();
602 int32_t dataLoc = allocateData(2); 604 int32_t dataLoc = fRXPat->fDataSize;
605 fRXPat->fDataSize += 2;
603 int32_t op = URX_BUILD(URX_LA_START, dataLoc); 606 int32_t op = URX_BUILD(URX_LA_START, dataLoc);
604 appendOp(op); 607 fRXPat->fCompiledPat->addElement(op, *fStatus);
605 608
606 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patche d later. 609 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patche d later.
607 appendOp(op); 610 fRXPat->fCompiledPat->addElement(op, *fStatus);
608 611
609 op = URX_BUILD(URX_NOP, 0); 612 op = URX_BUILD(URX_NOP, 0);
610 appendOp(op); 613 fRXPat->fCompiledPat->addElement(op, *fStatus);
611 614
612 // On the Parentheses stack, start a new frame and add the postions 615 // On the Parentheses stack, start a new frame and add the postions
613 // of the StateSave and NOP. 616 // of the StateSave and NOP.
614 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state 617 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state
615 fParenStack.push(negLookAhead, *fStatus); // Fram e type 618 fParenStack.push(negLookAhead, *fStatus); // Fram e type
616 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location 619 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
617 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 620 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
618 621
619 // Instructions #5 - #7 will be added when the ')' is encountered. 622 // Instructions #5 - #7 will be added when the ')' is encountered.
620 } 623 }
(...skipping 17 matching lines...) Expand all
638 // Allocate a block of matcher data, to contain (when running a match) 641 // Allocate a block of matcher data, to contain (when running a match)
639 // 0: Stack ptr on entry 642 // 0: Stack ptr on entry
640 // 1: Input Index on entry 643 // 1: Input Index on entry
641 // 2: Start index of match current match attempt. 644 // 2: Start index of match current match attempt.
642 // 3: Original Input String len. 645 // 3: Original Input String len.
643 646
644 // Generate match code for any pending literals. 647 // Generate match code for any pending literals.
645 fixLiterals(); 648 fixLiterals();
646 649
647 // Allocate data space 650 // Allocate data space
648 int32_t dataLoc = allocateData(4); 651 int32_t dataLoc = fRXPat->fDataSize;
652 fRXPat->fDataSize += 4;
649 653
650 // Emit URX_LB_START 654 // Emit URX_LB_START
651 int32_t op = URX_BUILD(URX_LB_START, dataLoc); 655 int32_t op = URX_BUILD(URX_LB_START, dataLoc);
652 appendOp(op); 656 fRXPat->fCompiledPat->addElement(op, *fStatus);
653 657
654 // Emit URX_LB_CONT 658 // Emit URX_LB_CONT
655 op = URX_BUILD(URX_LB_CONT, dataLoc); 659 op = URX_BUILD(URX_LB_CONT, dataLoc);
656 appendOp(op); 660 fRXPat->fCompiledPat->addElement(op, *fStatus);
657 appendOp(0); // MinMatchLength. To be filled later. 661 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength . To be filled later.
658 appendOp(0); // MaxMatchLength. To be filled later. 662 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength . To be filled later.
659 663
660 // Emit the NOP 664 // Emit the NOP
661 op = URX_BUILD(URX_NOP, 0); 665 op = URX_BUILD(URX_NOP, 0);
662 appendOp(op); 666 fRXPat->fCompiledPat->addElement(op, *fStatus);
663 appendOp(op); 667 fRXPat->fCompiledPat->addElement(op, *fStatus);
664 668
665 // On the Parentheses stack, start a new frame and add the postions 669 // On the Parentheses stack, start a new frame and add the postions
666 // of the URX_LB_CONT and the NOP. 670 // of the URX_LB_CONT and the NOP.
667 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state 671 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state
668 fParenStack.push(lookBehind, *fStatus); // Fra me type 672 fParenStack.push(lookBehind, *fStatus); // Fra me type
669 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 673 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
670 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 674 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
671 675
672 // The final two instructions will be added when the ')' is encounte red. 676 // The final two instructions will be added when the ')' is encounte red.
673 } 677 }
(...skipping 19 matching lines...) Expand all
693 // Allocate a block of matcher data, to contain (when running a match) 697 // Allocate a block of matcher data, to contain (when running a match)
694 // 0: Stack ptr on entry 698 // 0: Stack ptr on entry
695 // 1: Input Index on entry 699 // 1: Input Index on entry
696 // 2: Start index of match current match attempt. 700 // 2: Start index of match current match attempt.
697 // 3: Original Input String len. 701 // 3: Original Input String len.
698 702
699 // Generate match code for any pending literals. 703 // Generate match code for any pending literals.
700 fixLiterals(); 704 fixLiterals();
701 705
702 // Allocate data space 706 // Allocate data space
703 int32_t dataLoc = allocateData(4); 707 int32_t dataLoc = fRXPat->fDataSize;
708 fRXPat->fDataSize += 4;
704 709
705 // Emit URX_LB_START 710 // Emit URX_LB_START
706 int32_t op = URX_BUILD(URX_LB_START, dataLoc); 711 int32_t op = URX_BUILD(URX_LB_START, dataLoc);
707 appendOp(op); 712 fRXPat->fCompiledPat->addElement(op, *fStatus);
708 713
709 // Emit URX_LBN_CONT 714 // Emit URX_LBN_CONT
710 op = URX_BUILD(URX_LBN_CONT, dataLoc); 715 op = URX_BUILD(URX_LBN_CONT, dataLoc);
711 appendOp(op); 716 fRXPat->fCompiledPat->addElement(op, *fStatus);
712 appendOp(0); // MinMatchLength. To be filled later. 717 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength . To be filled later.
713 appendOp(0); // MaxMatchLength. To be filled later. 718 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength . To be filled later.
714 appendOp(0); // Continue Loc. To be filled later. 719 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later.
715 720
716 // Emit the NOP 721 // Emit the NOP
717 op = URX_BUILD(URX_NOP, 0); 722 op = URX_BUILD(URX_NOP, 0);
718 appendOp(op); 723 fRXPat->fCompiledPat->addElement(op, *fStatus);
719 appendOp(op); 724 fRXPat->fCompiledPat->addElement(op, *fStatus);
720 725
721 // On the Parentheses stack, start a new frame and add the postions 726 // On the Parentheses stack, start a new frame and add the postions
722 // of the URX_LB_CONT and the NOP. 727 // of the URX_LB_CONT and the NOP.
723 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state 728 fParenStack.push(fModeFlags, *fStatus); // Mat ch mode state
724 fParenStack.push(lookBehindN, *fStatus); // Fra me type 729 fParenStack.push(lookBehindN, *fStatus); // Fra me type
725 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 730 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
726 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 731 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
727 732
728 // The final two instructions will be added when the ')' is encountered. 733 // The final two instructions will be added when the ')' is encountered.
729 } 734 }
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
780 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 785 int32_t topLoc = blockTopLoc(FALSE); // location of item #1
781 int32_t frameLoc; 786 int32_t frameLoc;
782 787
783 // Check for simple constructs, which may get special optimized code. 788 // Check for simple constructs, which may get special optimized code.
784 if (topLoc == fRXPat->fCompiledPat->size() - 1) { 789 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
785 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 790 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
786 791
787 if (URX_TYPE(repeatedOp) == URX_SETREF) { 792 if (URX_TYPE(repeatedOp) == URX_SETREF) {
788 // Emit optimized code for [char set]+ 793 // Emit optimized code for [char set]+
789 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 794 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
790 appendOp(loopOpI); 795 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
791 frameLoc = allocateStackData(1); 796 frameLoc = fRXPat->fFrameSize;
797 fRXPat->fFrameSize++;
792 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); 798 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
793 appendOp(loopOpC); 799 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
794 break; 800 break;
795 } 801 }
796 802
797 if (URX_TYPE(repeatedOp) == URX_DOTANY || 803 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
798 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 804 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
799 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 805 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
800 // Emit Optimized code for .+ operations. 806 // Emit Optimized code for .+ operations.
801 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); 807 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
802 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 808 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
803 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. 809 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
804 loopOpI |= 1; 810 loopOpI |= 1;
805 } 811 }
806 if (fModeFlags & UREGEX_UNIX_LINES) { 812 if (fModeFlags & UREGEX_UNIX_LINES) {
807 loopOpI |= 2; 813 loopOpI |= 2;
808 } 814 }
809 appendOp(loopOpI); 815 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
810 frameLoc = allocateStackData(1); 816 frameLoc = fRXPat->fFrameSize;
817 fRXPat->fFrameSize++;
811 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); 818 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
812 appendOp(loopOpC); 819 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
813 break; 820 break;
814 } 821 }
815 822
816 } 823 }
817 824
818 // General case. 825 // General case.
819 826
820 // Check for minimum match length of zero, which requires 827 // Check for minimum match length of zero, which requires
821 // extra loop-breaking code. 828 // extra loop-breaking code.
822 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { 829 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
823 // Zero length match is possible. 830 // Zero length match is possible.
824 // Emit the code sequence that can handle it. 831 // Emit the code sequence that can handle it.
825 insertOp(topLoc); 832 insertOp(topLoc);
826 frameLoc = allocateStackData(1); 833 frameLoc = fRXPat->fFrameSize;
834 fRXPat->fFrameSize++;
827 835
828 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); 836 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
829 fRXPat->fCompiledPat->setElementAt(op, topLoc); 837 fRXPat->fCompiledPat->setElementAt(op, topLoc);
830 838
831 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); 839 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1);
832 appendOp(op); 840 fRXPat->fCompiledPat->addElement(op, *fStatus);
833 } else { 841 } else {
834 // Simpler code when the repeated body must match something non-empty 842 // Simpler code when the repeated body must match something non-empty
835 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); 843 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc);
836 appendOp(jmpOp); 844 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
837 } 845 }
838 } 846 }
839 break; 847 break;
840 848
841 case doNGPlus: 849 case doNGPlus:
842 // Non-greedy '+?' compiles to 850 // Non-greedy '+?' compiles to
843 // 1. stuff to be repeated (already built) 851 // 1. stuff to be repeated (already built)
844 // 2. state-save 1 852 // 2. state-save 1
845 // 3. ... 853 // 3. ...
846 { 854 {
847 int32_t topLoc = blockTopLoc(FALSE); 855 int32_t topLoc = blockTopLoc(FALSE);
848 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); 856 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
849 appendOp(saveStateOp); 857 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
850 } 858 }
851 break; 859 break;
852 860
853 861
854 case doOpt: 862 case doOpt:
855 // Normal (greedy) ? quantifier. 863 // Normal (greedy) ? quantifier.
856 // Compiles to 864 // Compiles to
857 // 1. state save 3 865 // 1. state save 3
858 // 2. body of optional block 866 // 2. body of optional block
859 // 3. ... 867 // 3. ...
(...skipping 16 matching lines...) Expand all
876 // This code is less than ideal, with two jmps instead of one, because we can only 884 // This code is less than ideal, with two jmps instead of one, because we can only
877 // insert one instruction at the top of the block being iterated. 885 // insert one instruction at the top of the block being iterated.
878 { 886 {
879 int32_t jmp1_loc = blockTopLoc(TRUE); 887 int32_t jmp1_loc = blockTopLoc(TRUE);
880 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); 888 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
881 889
882 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); 890 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1);
883 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); 891 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
884 892
885 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); 893 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2);
886 appendOp(jmp2_op); 894 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);
887 895
888 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); 896 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
889 appendOp(save_op); 897 fRXPat->fCompiledPat->addElement(save_op, *fStatus);
890 } 898 }
891 break; 899 break;
892 900
893 901
894 case doStar: 902 case doStar:
895 // Normal (greedy) * quantifier. 903 // Normal (greedy) * quantifier.
896 // Compiles to 904 // Compiles to
897 // 1. STATE_SAVE 4 905 // 1. STATE_SAVE 4
898 // 2. body of stuff being iterated over 906 // 2. body of stuff being iterated over
899 // 3. JMP_SAV 2 907 // 3. JMP_SAV 2
(...skipping 21 matching lines...) Expand all
921 929
922 // Check for simple *, where the construct being repeated 930 // Check for simple *, where the construct being repeated
923 // compiled to single opcode, and might be optimizable. 931 // compiled to single opcode, and might be optimizable.
924 if (topLoc == fRXPat->fCompiledPat->size() - 1) { 932 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
925 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 933 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
926 934
927 if (URX_TYPE(repeatedOp) == URX_SETREF) { 935 if (URX_TYPE(repeatedOp) == URX_SETREF) {
928 // Emit optimized code for a [char set]* 936 // Emit optimized code for a [char set]*
929 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 937 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
930 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 938 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
931 dataLoc = allocateStackData(1); 939 dataLoc = fRXPat->fFrameSize;
940 fRXPat->fFrameSize++;
932 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); 941 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
933 appendOp(loopOpC); 942 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
934 break; 943 break;
935 } 944 }
936 945
937 if (URX_TYPE(repeatedOp) == URX_DOTANY || 946 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
938 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 947 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
939 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 948 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
940 // Emit Optimized code for .* operations. 949 // Emit Optimized code for .* operations.
941 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); 950 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
942 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 951 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
943 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. 952 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
944 loopOpI |= 1; 953 loopOpI |= 1;
945 } 954 }
946 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { 955 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
947 loopOpI |= 2; 956 loopOpI |= 2;
948 } 957 }
949 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 958 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
950 dataLoc = allocateStackData(1); 959 dataLoc = fRXPat->fFrameSize;
960 fRXPat->fFrameSize++;
951 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); 961 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
952 appendOp(loopOpC); 962 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
953 break; 963 break;
954 } 964 }
955 } 965 }
956 966
957 // Emit general case code for this * 967 // Emit general case code for this *
958 // The optimizations did not apply. 968 // The optimizations did not apply.
959 969
960 int32_t saveStateLoc = blockTopLoc(TRUE); 970 int32_t saveStateLoc = blockTopLoc(TRUE);
961 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); 971 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1);
962 972
963 // Check for minimum match length of zero, which requires 973 // Check for minimum match length of zero, which requires
964 // extra loop-breaking code. 974 // extra loop-breaking code.
965 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { 975 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
966 insertOp(saveStateLoc); 976 insertOp(saveStateLoc);
967 dataLoc = allocateStackData(1); 977 dataLoc = fRXPat->fFrameSize;
978 fRXPat->fFrameSize++;
968 979
969 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); 980 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
970 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); 981 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
971 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); 982 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2);
972 } 983 }
973 984
974 // Locate the position in the compiled pattern where the match will continue 985 // Locate the position in the compiled pattern where the match will continue
975 // after completing the *. (4 or 5 in the comment above) 986 // after completing the *. (4 or 5 in the comment above)
976 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 987 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
977 988
978 // Put together the save state op store it into the compiled code. 989 // Put together the save state op store it into the compiled code.
979 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); 990 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
980 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 991 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
981 992
982 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. 993 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
983 appendOp(jmpOp); 994 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
984 } 995 }
985 break; 996 break;
986 997
987 case doNGStar: 998 case doNGStar:
988 // Non-greedy *? quantifier 999 // Non-greedy *? quantifier
989 // compiles to 1000 // compiles to
990 // 1. JMP 3 1001 // 1. JMP 3
991 // 2. body of stuff being iterated over 1002 // 2. body of stuff being iterated over
992 // 3. STATE_SAVE 2 1003 // 3. STATE_SAVE 2
993 // 4 ... 1004 // 4 ...
994 { 1005 {
995 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1. 1006 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
996 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. 1007 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
997 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); 1008 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc);
998 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); 1009 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
999 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); 1010 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1000 appendOp(stateSaveOp); 1011 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
1001 } 1012 }
1002 break; 1013 break;
1003 1014
1004 1015
1005 case doIntervalInit: 1016 case doIntervalInit:
1006 // The '{' opening an interval quantifier was just scanned. 1017 // The '{' opening an interval quantifier was just scanned.
1007 // Init the counter varaiables that will accumulate the values as the digits 1018 // Init the counter varaiables that will accumulate the values as the digits
1008 // are scanned. 1019 // are scanned.
1009 fIntervalLow = 0; 1020 fIntervalLow = 0;
1010 fIntervalUpper = -1; 1021 fIntervalUpper = -1;
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
1059 int32_t topLoc = blockTopLoc(FALSE); 1070 int32_t topLoc = blockTopLoc(FALSE);
1060 1071
1061 // Produce normal looping code. 1072 // Produce normal looping code.
1062 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1073 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1063 1074
1064 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP 1075 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1065 // just as if the loop was inclosed in atomic parentheses. 1076 // just as if the loop was inclosed in atomic parentheses.
1066 1077
1067 // First the STO_SP before the start of the loop 1078 // First the STO_SP before the start of the loop
1068 insertOp(topLoc); 1079 insertOp(topLoc);
1069 1080 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
1070 int32_t varLoc = allocateData(1); // Reserve a data location for saving the 1081 fRXPat->fDataSize += 1; // state stack ptr.
1071 int32_t op = URX_BUILD(URX_STO_SP, varLoc); 1082 int32_t op = URX_BUILD(URX_STO_SP, varLoc);
1072 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1083 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1073 1084
1074 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); 1085 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1075 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc); 1086 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1076 loopOp++; // point LoopOp after the just-inserted STO_SP 1087 loopOp++; // point LoopOp after the just-inserted STO_SP
1077 fRXPat->fCompiledPat->push(loopOp, *fStatus); 1088 fRXPat->fCompiledPat->push(loopOp, *fStatus);
1078 1089
1079 // Then the LD_SP after the end of the loop 1090 // Then the LD_SP after the end of the loop
1080 op = URX_BUILD(URX_LD_SP, varLoc); 1091 op = URX_BUILD(URX_LD_SP, varLoc);
1081 appendOp(op); 1092 fRXPat->fCompiledPat->addElement(op, *fStatus);
1082 } 1093 }
1083 1094
1084 break; 1095 break;
1085 1096
1086 case doNGInterval: 1097 case doNGInterval:
1087 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. 1098 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1088 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); 1099 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1089 break; 1100 break;
1090 1101
1091 case doIntervalError: 1102 case doIntervalError:
(...skipping 23 matching lines...) Expand all
1115 { 1126 {
1116 fixLiterals(FALSE); 1127 fixLiterals(FALSE);
1117 int32_t op; 1128 int32_t op;
1118 if (fModeFlags & UREGEX_DOTALL) { 1129 if (fModeFlags & UREGEX_DOTALL) {
1119 op = URX_BUILD(URX_DOTANY_ALL, 0); 1130 op = URX_BUILD(URX_DOTANY_ALL, 0);
1120 } else if (fModeFlags & UREGEX_UNIX_LINES) { 1131 } else if (fModeFlags & UREGEX_UNIX_LINES) {
1121 op = URX_BUILD(URX_DOTANY_UNIX, 0); 1132 op = URX_BUILD(URX_DOTANY_UNIX, 0);
1122 } else { 1133 } else {
1123 op = URX_BUILD(URX_DOTANY, 0); 1134 op = URX_BUILD(URX_DOTANY, 0);
1124 } 1135 }
1125 appendOp(op); 1136 fRXPat->fCompiledPat->addElement(op, *fStatus);
1126 } 1137 }
1127 break; 1138 break;
1128 1139
1129 case doCaret: 1140 case doCaret:
1130 { 1141 {
1131 fixLiterals(FALSE); 1142 fixLiterals(FALSE);
1132 int32_t op = 0; 1143 int32_t op = 0;
1133 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1144 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134 op = URX_CARET; 1145 op = URX_CARET;
1135 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1146 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136 op = URX_CARET_M; 1147 op = URX_CARET_M;
1137 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1148 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138 op = URX_CARET; // Only testing true start of input. 1149 op = URX_CARET; // Only testing true start of input.
1139 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1150 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140 op = URX_CARET_M_UNIX; 1151 op = URX_CARET_M_UNIX;
1141 } 1152 }
1142 appendOp(URX_BUILD(op, 0)); 1153 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1143 } 1154 }
1144 break; 1155 break;
1145 1156
1146 case doDollar: 1157 case doDollar:
1147 { 1158 {
1148 fixLiterals(FALSE); 1159 fixLiterals(FALSE);
1149 int32_t op = 0; 1160 int32_t op = 0;
1150 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1161 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1151 op = URX_DOLLAR; 1162 op = URX_DOLLAR;
1152 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1163 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1153 op = URX_DOLLAR_M; 1164 op = URX_DOLLAR_M;
1154 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1165 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1155 op = URX_DOLLAR_D; 1166 op = URX_DOLLAR_D;
1156 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1167 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1157 op = URX_DOLLAR_MD; 1168 op = URX_DOLLAR_MD;
1158 } 1169 }
1159 appendOp(URX_BUILD(op, 0)); 1170 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1160 } 1171 }
1161 break; 1172 break;
1162 1173
1163 case doBackslashA: 1174 case doBackslashA:
1164 fixLiterals(FALSE); 1175 fixLiterals(FALSE);
1165 appendOp(URX_BUILD(URX_CARET, 0)); 1176 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
1166 break; 1177 break;
1167 1178
1168 case doBackslashB: 1179 case doBackslashB:
1169 { 1180 {
1170 #if UCONFIG_NO_BREAK_ITERATION==1 1181 #if UCONFIG_NO_BREAK_ITERATION==1
1171 if (fModeFlags & UREGEX_UWORD) { 1182 if (fModeFlags & UREGEX_UWORD) {
1172 error(U_UNSUPPORTED_ERROR); 1183 error(U_UNSUPPORTED_ERROR);
1173 } 1184 }
1174 #endif 1185 #endif
1175 fixLiterals(FALSE); 1186 fixLiterals(FALSE);
1176 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1187 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1177 appendOp(URX_BUILD(op, 1)); 1188 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus);
1178 } 1189 }
1179 break; 1190 break;
1180 1191
1181 case doBackslashb: 1192 case doBackslashb:
1182 { 1193 {
1183 #if UCONFIG_NO_BREAK_ITERATION==1 1194 #if UCONFIG_NO_BREAK_ITERATION==1
1184 if (fModeFlags & UREGEX_UWORD) { 1195 if (fModeFlags & UREGEX_UWORD) {
1185 error(U_UNSUPPORTED_ERROR); 1196 error(U_UNSUPPORTED_ERROR);
1186 } 1197 }
1187 #endif 1198 #endif
1188 fixLiterals(FALSE); 1199 fixLiterals(FALSE);
1189 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1200 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1190 appendOp(URX_BUILD(op, 0)); 1201 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1191 } 1202 }
1192 break; 1203 break;
1193 1204
1194 case doBackslashD: 1205 case doBackslashD:
1195 fixLiterals(FALSE); 1206 fixLiterals(FALSE);
1196 appendOp(URX_BUILD(URX_BACKSLASH_D, 1)); 1207 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
1197 break; 1208 break;
1198 1209
1199 case doBackslashd: 1210 case doBackslashd:
1200 fixLiterals(FALSE); 1211 fixLiterals(FALSE);
1201 appendOp(URX_BUILD(URX_BACKSLASH_D, 0)); 1212 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
1202 break; 1213 break;
1203 1214
1204 case doBackslashG: 1215 case doBackslashG:
1205 fixLiterals(FALSE); 1216 fixLiterals(FALSE);
1206 appendOp(URX_BUILD(URX_BACKSLASH_G, 0)); 1217 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
1207 break; 1218 break;
1208 1219
1209 case doBackslashS: 1220 case doBackslashS:
1210 fixLiterals(FALSE); 1221 fixLiterals(FALSE);
1211 appendOp(URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET)); 1222 fRXPat->fCompiledPat->addElement(
1223 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
1212 break; 1224 break;
1213 1225
1214 case doBackslashs: 1226 case doBackslashs:
1215 fixLiterals(FALSE); 1227 fixLiterals(FALSE);
1216 appendOp(URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET)); 1228 fRXPat->fCompiledPat->addElement(
1229 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
1217 break; 1230 break;
1218 1231
1219 case doBackslashW: 1232 case doBackslashW:
1220 fixLiterals(FALSE); 1233 fixLiterals(FALSE);
1221 appendOp(URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET)); 1234 fRXPat->fCompiledPat->addElement(
1235 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
1222 break; 1236 break;
1223 1237
1224 case doBackslashw: 1238 case doBackslashw:
1225 fixLiterals(FALSE); 1239 fixLiterals(FALSE);
1226 appendOp(URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET)); 1240 fRXPat->fCompiledPat->addElement(
1241 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
1227 break; 1242 break;
1228 1243
1229 case doBackslashX: 1244 case doBackslashX:
1230 fixLiterals(FALSE); 1245 fixLiterals(FALSE);
1231 appendOp(URX_BUILD(URX_BACKSLASH_X, 0)); 1246 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
1232 break; 1247 break;
1233 1248
1234 1249
1235 case doBackslashZ: 1250 case doBackslashZ:
1236 fixLiterals(FALSE); 1251 fixLiterals(FALSE);
1237 appendOp(URX_BUILD(URX_DOLLAR, 0)); 1252 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
1238 break; 1253 break;
1239 1254
1240 case doBackslashz: 1255 case doBackslashz:
1241 fixLiterals(FALSE); 1256 fixLiterals(FALSE);
1242 appendOp(URX_BUILD(URX_BACKSLASH_Z, 0)); 1257 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
1243 break; 1258 break;
1244 1259
1245 case doEscapeError: 1260 case doEscapeError:
1246 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1261 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1247 break; 1262 break;
1248 1263
1249 case doExit: 1264 case doExit:
1250 fixLiterals(FALSE); 1265 fixLiterals(FALSE);
1251 returnVal = FALSE; 1266 returnVal = FALSE;
1252 break; 1267 break;
1253 1268
1254 case doProperty: 1269 case doProperty:
1255 { 1270 {
1256 fixLiterals(FALSE); 1271 fixLiterals(FALSE);
1257 UnicodeSet *theSet = scanProp(); 1272 UnicodeSet *theSet = scanProp();
1258 compileSet(theSet); 1273 compileSet(theSet);
1259 } 1274 }
1260 break; 1275 break;
1261 1276
1262 case doNamedChar: 1277 case doNamedChar:
1263 { 1278 {
1264 UChar32 c = scanNamedChar(); 1279 UChar32 c = scanNamedChar();
1265 literalChar(c); 1280 literalChar(c);
1266 } 1281 }
1267 break; 1282 break;
1268 1283
1269 1284
1270 case doBackRef: 1285 case doBackRef:
1271 // BackReference. Somewhat unusual in that the front-end can not completely parse 1286 // BackReference. Somewhat unusual in that the front-end can not completely parse
1272 // the regular expression, because the number of digits to be consumed 1287 // the regular expression, because the number of digits to be consumed
1273 // depends on the number of capture groups that have been defined. So 1288 // depends on the number of capture groups that have been defined. So
1274 // we have to do it here instead. 1289 // we have to do it here instead.
1275 { 1290 {
1276 int32_t numCaptureGroups = fRXPat->fGroupMap->size(); 1291 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1277 int32_t groupNum = 0; 1292 int32_t groupNum = 0;
1278 UChar32 c = fC.fChar; 1293 UChar32 c = fC.fChar;
(...skipping 19 matching lines...) Expand all
1298 // of compilation, it will be changed to the variable's location. 1313 // of compilation, it will be changed to the variable's location.
1299 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence, 1314 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence,
1300 // and shouldn't enter this code path at all. 1315 // and shouldn't enter this code path at all.
1301 fixLiterals(FALSE); 1316 fixLiterals(FALSE);
1302 int32_t op; 1317 int32_t op;
1303 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1318 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1304 op = URX_BUILD(URX_BACKREF_I, groupNum); 1319 op = URX_BUILD(URX_BACKREF_I, groupNum);
1305 } else { 1320 } else {
1306 op = URX_BUILD(URX_BACKREF, groupNum); 1321 op = URX_BUILD(URX_BACKREF, groupNum);
1307 } 1322 }
1308 appendOp(op); 1323 fRXPat->fCompiledPat->addElement(op, *fStatus);
1309 } 1324 }
1310 break; 1325 break;
1311 1326
1312 1327
1313 case doPossessivePlus: 1328 case doPossessivePlus:
1314 // Possessive ++ quantifier. 1329 // Possessive ++ quantifier.
1315 // Compiles to 1330 // Compiles to
1316 // 1. STO_SP 1331 // 1. STO_SP
1317 // 2. body of stuff being iterated over 1332 // 2. body of stuff being iterated over
1318 // 3. STATE_SAVE 5 1333 // 3. STATE_SAVE 5
1319 // 4. JMP 2 1334 // 4. JMP 2
1320 // 5. LD_SP 1335 // 5. LD_SP
1321 // 6. ... 1336 // 6. ...
1322 // 1337 //
1323 // Note: TODO: This is pretty inefficient. A mass of saved state is built up 1338 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1324 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056 1339 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1325 // 1340 //
1326 { 1341 {
1327 // Emit the STO_SP 1342 // Emit the STO_SP
1328 int32_t topLoc = blockTopLoc(TRUE); 1343 int32_t topLoc = blockTopLoc(TRUE);
1329 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. 1344 int32_t stoLoc = fRXPat->fDataSize;
1345 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
1330 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1346 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
1331 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1347 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1332 1348
1333 // Emit the STATE_SAVE 1349 // Emit the STATE_SAVE
1334 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); 1350 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1335 appendOp(op); 1351 fRXPat->fCompiledPat->addElement(op, *fStatus);
1336 1352
1337 // Emit the JMP 1353 // Emit the JMP
1338 op = URX_BUILD(URX_JMP, topLoc+1); 1354 op = URX_BUILD(URX_JMP, topLoc+1);
1339 appendOp(op); 1355 fRXPat->fCompiledPat->addElement(op, *fStatus);
1340 1356
1341 // Emit the LD_SP 1357 // Emit the LD_SP
1342 op = URX_BUILD(URX_LD_SP, stoLoc); 1358 op = URX_BUILD(URX_LD_SP, stoLoc);
1343 appendOp(op); 1359 fRXPat->fCompiledPat->addElement(op, *fStatus);
1344 } 1360 }
1345 break; 1361 break;
1346 1362
1347 case doPossessiveStar: 1363 case doPossessiveStar:
1348 // Possessive *+ quantifier. 1364 // Possessive *+ quantifier.
1349 // Compiles to 1365 // Compiles to
1350 // 1. STO_SP loc 1366 // 1. STO_SP loc
1351 // 2. STATE_SAVE 5 1367 // 2. STATE_SAVE 5
1352 // 3. body of stuff being iterated over 1368 // 3. body of stuff being iterated over
1353 // 4. JMP 2 1369 // 4. JMP 2
1354 // 5. LD_SP loc 1370 // 5. LD_SP loc
1355 // 6 ... 1371 // 6 ...
1356 // TODO: do something to cut back the state stack each time through the loop. 1372 // TODO: do something to cut back the state stack each time through the loop.
1357 { 1373 {
1358 // Reserve two slots at the top of the block. 1374 // Reserve two slots at the top of the block.
1359 int32_t topLoc = blockTopLoc(TRUE); 1375 int32_t topLoc = blockTopLoc(TRUE);
1360 insertOp(topLoc); 1376 insertOp(topLoc);
1361 1377
1362 // emit STO_SP loc 1378 // emit STO_SP loc
1363 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. 1379 int32_t stoLoc = fRXPat->fDataSize;
1380 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
1364 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1381 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
1365 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1382 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1366 1383
1367 // Emit the SAVE_STATE 5 1384 // Emit the SAVE_STATE 5
1368 int32_t L7 = fRXPat->fCompiledPat->size()+1; 1385 int32_t L7 = fRXPat->fCompiledPat->size()+1;
1369 op = URX_BUILD(URX_STATE_SAVE, L7); 1386 op = URX_BUILD(URX_STATE_SAVE, L7);
1370 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1387 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1371 1388
1372 // Append the JMP operation. 1389 // Append the JMP operation.
1373 op = URX_BUILD(URX_JMP, topLoc+1); 1390 op = URX_BUILD(URX_JMP, topLoc+1);
1374 appendOp(op); 1391 fRXPat->fCompiledPat->addElement(op, *fStatus);
1375 1392
1376 // Emit the LD_SP loc 1393 // Emit the LD_SP loc
1377 op = URX_BUILD(URX_LD_SP, stoLoc); 1394 op = URX_BUILD(URX_LD_SP, stoLoc);
1378 appendOp(op); 1395 fRXPat->fCompiledPat->addElement(op, *fStatus);
1379 } 1396 }
1380 break; 1397 break;
1381 1398
1382 case doPossessiveOpt: 1399 case doPossessiveOpt:
1383 // Possessive ?+ quantifier. 1400 // Possessive ?+ quantifier.
1384 // Compiles to 1401 // Compiles to
1385 // 1. STO_SP loc 1402 // 1. STO_SP loc
1386 // 2. SAVE_STATE 5 1403 // 2. SAVE_STATE 5
1387 // 3. body of optional block 1404 // 3. body of optional block
1388 // 4. LD_SP loc 1405 // 4. LD_SP loc
1389 // 5. ... 1406 // 5. ...
1390 // 1407 //
1391 { 1408 {
1392 // Reserve two slots at the top of the block. 1409 // Reserve two slots at the top of the block.
1393 int32_t topLoc = blockTopLoc(TRUE); 1410 int32_t topLoc = blockTopLoc(TRUE);
1394 insertOp(topLoc); 1411 insertOp(topLoc);
1395 1412
1396 // Emit the STO_SP 1413 // Emit the STO_SP
1397 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. 1414 int32_t stoLoc = fRXPat->fDataSize;
1415 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
1398 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1416 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
1399 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1417 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1400 1418
1401 // Emit the SAVE_STATE 1419 // Emit the SAVE_STATE
1402 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 1420 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1403 op = URX_BUILD(URX_STATE_SAVE, continueLoc); 1421 op = URX_BUILD(URX_STATE_SAVE, continueLoc);
1404 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1422 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1405 1423
1406 // Emit the LD_SP 1424 // Emit the LD_SP
1407 op = URX_BUILD(URX_LD_SP, stoLoc); 1425 op = URX_BUILD(URX_LD_SP, stoLoc);
1408 appendOp(op); 1426 fRXPat->fCompiledPat->addElement(op, *fStatus);
1409 } 1427 }
1410 break; 1428 break;
1411 1429
1412 1430
1413 case doBeginMatchMode: 1431 case doBeginMatchMode:
1414 fNewModeFlags = fModeFlags; 1432 fNewModeFlags = fModeFlags;
1415 fSetModeFlag = TRUE; 1433 fSetModeFlag = TRUE;
1416 break; 1434 break;
1417 1435
1418 case doMatchMode: // (?i) and similar 1436 case doMatchMode: // (?i) and similar
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
1455 // We've got a (?i: or similar. Begin a parenthesized block, save old 1473 // We've got a (?i: or similar. Begin a parenthesized block, save old
1456 // mode flags so they can be restored at the close of the block. 1474 // mode flags so they can be restored at the close of the block.
1457 // 1475 //
1458 // Compile to a 1476 // Compile to a
1459 // - NOP, which later may be replaced by a save-state if the 1477 // - NOP, which later may be replaced by a save-state if the
1460 // parenthesized group gets a * quantifier, followed by 1478 // parenthesized group gets a * quantifier, followed by
1461 // - NOP, which may later be replaced by a save-state if there 1479 // - NOP, which may later be replaced by a save-state if there
1462 // is an '|' alternation within the parens. 1480 // is an '|' alternation within the parens.
1463 { 1481 {
1464 fixLiterals(FALSE); 1482 fixLiterals(FALSE);
1465 appendOp(URX_BUILD(URX_NOP, 0)); 1483 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
1466 appendOp(URX_BUILD(URX_NOP, 0)); 1484 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
1467 1485
1468 // On the Parentheses stack, start a new frame and add the postions 1486 // On the Parentheses stack, start a new frame and add the postions
1469 // of the two NOPs (a normal non-capturing () frame, except for th e 1487 // of the two NOPs (a normal non-capturing () frame, except for th e
1470 // saving of the orignal mode flags.) 1488 // saving of the orignal mode flags.)
1471 fParenStack.push(fModeFlags, *fStatus); 1489 fParenStack.push(fModeFlags, *fStatus);
1472 fParenStack.push(flags, *fStatus); // Fra me Marker 1490 fParenStack.push(flags, *fStatus); // Fra me Marker
1473 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP 1491 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1474 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 1492 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1475 1493
1476 // Set the current mode flags to the new values. 1494 // Set the current mode flags to the new values.
(...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after
1617 // Finished a complete set expression, including all nested sets. 1635 // Finished a complete set expression, including all nested sets.
1618 // The close bracket has already triggered clearing out pending set op erators, 1636 // The close bracket has already triggered clearing out pending set op erators,
1619 // the operator stack should be empty and the operand stack should ha ve just 1637 // the operator stack should be empty and the operand stack should ha ve just
1620 // one entry, the result set. 1638 // one entry, the result set.
1621 U_ASSERT(fSetOpStack.empty()); 1639 U_ASSERT(fSetOpStack.empty());
1622 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); 1640 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1623 U_ASSERT(fSetStack.empty()); 1641 U_ASSERT(fSetStack.empty());
1624 compileSet(theSet); 1642 compileSet(theSet);
1625 break; 1643 break;
1626 } 1644 }
1627 1645
1628 case doSetIntersection2: 1646 case doSetIntersection2:
1629 // Have scanned something like [abc&& 1647 // Have scanned something like [abc&&
1630 setPushOp(setIntersection2); 1648 setPushOp(setIntersection2);
1631 break; 1649 break;
1632 1650
1633 case doSetLiteral: 1651 case doSetLiteral:
1634 // Union the just-scanned literal character into the set being built. 1652 // Union the just-scanned literal character into the set being built.
1635 // This operation is the highest precedence set operation, so we can always do 1653 // This operation is the highest precedence set operation, so we can always do
1636 // it immediately, without waiting to see what follows. It is necess ary to perform 1654 // it immediately, without waiting to see what follows. It is necess ary to perform
1637 // any pending '-' or '&' operation first, because these have the sam e precedence 1655 // any pending '-' or '&' operation first, because these have the sam e precedence
1638 // as union-ing in a literal' 1656 // as union-ing in a literal'
1639 { 1657 {
1640 setEval(setUnion); 1658 setEval(setUnion);
1641 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1659 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1642 s->add(fC.fChar); 1660 s->add(fC.fChar);
1643 fLastSetLiteral = fC.fChar; 1661 fLastSetLiteral = fC.fChar;
1644 break; 1662 break;
1645 } 1663 }
1646 1664
1647 case doSetLiteralEscaped: 1665 case doSetLiteralEscaped:
1648 // A back-slash escaped literal character was encountered. 1666 // A back-slash escaped literal character was encountered.
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after
1723 case doSetPosixProp: 1741 case doSetPosixProp:
1724 { 1742 {
1725 UnicodeSet *s = scanPosixProp(); 1743 UnicodeSet *s = scanPosixProp();
1726 if (s != NULL) { 1744 if (s != NULL) {
1727 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1745 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1728 tos->addAll(*s); 1746 tos->addAll(*s);
1729 delete s; 1747 delete s;
1730 } // else error. scanProp() reported the error status already. 1748 } // else error. scanProp() reported the error status already.
1731 } 1749 }
1732 break; 1750 break;
1733 1751
1734 case doSetProp: 1752 case doSetProp:
1735 // Scanned a \p \P within [brackets]. 1753 // Scanned a \p \P within [brackets].
1736 { 1754 {
1737 UnicodeSet *s = scanProp(); 1755 UnicodeSet *s = scanProp();
1738 if (s != NULL) { 1756 if (s != NULL) {
1739 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1757 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1740 tos->addAll(*s); 1758 tos->addAll(*s);
1741 delete s; 1759 delete s;
1742 } // else error. scanProp() reported the error status already. 1760 } // else error. scanProp() reported the error status already.
1743 } 1761 }
1744 break; 1762 break;
1745 1763
1746 1764
1747 case doSetRange: 1765 case doSetRange:
1748 // We have scanned literal-literal. Add the range to the set. 1766 // We have scanned literal-literal. Add the range to the set.
1749 // The left character is already in the set, and is saved in fLastSetLit eral. 1767 // The left character is already in the set, and is saved in fLastSetLit eral.
1750 // The right side is the current character. 1768 // The right side is the current character.
1751 // Lower Limit > Upper limit being an error matches both Java 1769 // Lower Limit > Upper limit being an error matches both Java
1752 // and ICU UnicodeSet behavior. 1770 // and ICU UnicodeSet behavior.
1753 { 1771 {
1754 if (fLastSetLiteral > fC.fChar) { 1772 if (fLastSetLiteral > fC.fChar) {
1755 error(U_REGEX_INVALID_RANGE); 1773 error(U_REGEX_INVALID_RANGE);
1756 } 1774 }
1757 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1775 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1758 s->add(fLastSetLiteral, fC.fChar); 1776 s->add(fLastSetLiteral, fC.fChar);
1759 break; 1777 break;
1760 } 1778 }
1761 1779
1762 default: 1780 default:
1763 U_ASSERT(FALSE); 1781 U_ASSERT(FALSE);
1764 error(U_REGEX_INTERNAL_ERROR); 1782 error(U_REGEX_INTERNAL_ERROR);
1765 break; 1783 break;
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
1804 1822
1805 // If no literal characters have been scanned but not yet had code generated 1823 // If no literal characters have been scanned but not yet had code generated
1806 // for them, nothing needs to be done. 1824 // for them, nothing needs to be done.
1807 if (fLiteralChars.length() == 0) { 1825 if (fLiteralChars.length() == 0) {
1808 return; 1826 return;
1809 } 1827 }
1810 1828
1811 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.lengt h(), -1); 1829 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.lengt h(), -1);
1812 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); 1830 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1813 1831
1814 // Split: We need to ensure that the last item in the compiled pattern 1832 // Split: We need to ensure that the last item in the compiled pattern
1815 // refers only to the last literal scanned in the pattern, so that 1833 // refers only to the last literal scanned in the pattern, so that
1816 // quantifiers (*, +, etc.) affect only it, and not a longer string. 1834 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1817 // Split before case folding for case insensitive matches. 1835 // Split before case folding for case insensitive matches.
1818 1836
1819 if (split) { 1837 if (split) {
1820 fLiteralChars.truncate(indexOfLastCodePoint); 1838 fLiteralChars.truncate(indexOfLastCodePoint);
1821 fixLiterals(FALSE); // Recursive call, emit code to match the first pa rt of the string. 1839 fixLiterals(FALSE); // Recursive call, emit code to match the first pa rt of the string.
1822 // Note that the truncated literal string may be empty, in which case 1840 // Note that the truncated literal string may be empty, in which case
1823 // nothing will be emitted. 1841 // nothing will be emitted.
1824 1842
1825 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal. 1843 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1826 fixLiterals(FALSE); // Second recursive call, code for the fina l code point. 1844 fixLiterals(FALSE); // Second recursive call, code for the fina l code point.
1827 return; 1845 return;
1828 } 1846 }
1829 1847
1830 // If we are doing case-insensitive matching, case fold the string. This ma y expand 1848 // If we are doing case-insensitive matching, case fold the string. This ma y expand
1831 // the string, e.g. the German sharp-s turns into "ss" 1849 // the string, e.g. the German sharp-s turns into "ss"
1832 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1850 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1833 fLiteralChars.foldCase(); 1851 fLiteralChars.foldCase();
1834 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); 1852 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1835 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); 1853 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1836 } 1854 }
1837 1855
1838 if (indexOfLastCodePoint == 0) { 1856 if (indexOfLastCodePoint == 0) {
1839 // Single character, emit a URX_ONECHAR op to match it. 1857 // Single character, emit a URX_ONECHAR op to match it.
1840 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && 1858 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1841 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { 1859 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1842 op = URX_BUILD(URX_ONECHAR_I, lastCodePoint); 1860 op = URX_BUILD(URX_ONECHAR_I, lastCodePoint);
1843 } else { 1861 } else {
1844 op = URX_BUILD(URX_ONECHAR, lastCodePoint); 1862 op = URX_BUILD(URX_ONECHAR, lastCodePoint);
1845 } 1863 }
1846 appendOp(op); 1864 fRXPat->fCompiledPat->addElement(op, *fStatus);
1847 } else { 1865 } else {
1848 // Two or more chars, emit a URX_STRING to match them. 1866 // Two or more chars, emit a URX_STRING to match them.
1849 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1867 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1850 op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length()); 1868 op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length());
1851 } else { 1869 } else {
1852 // TODO here: add optimization to split case sensitive strings of l ength two 1870 // TODO here: add optimization to split case sensitive strings of l ength two
1853 // into two single char ops, for efficiency. 1871 // into two single char ops, for efficiency.
1854 op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length()); 1872 op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length());
1855 } 1873 }
1856 appendOp(op); 1874 fRXPat->fCompiledPat->addElement(op, *fStatus);
1857 op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length()); 1875 op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length());
1858 appendOp(op); 1876 fRXPat->fCompiledPat->addElement(op, *fStatus);
1859 1877
1860 // Add this string into the accumulated strings of the compiled pattern. 1878 // Add this string into the accumulated strings of the compiled pattern.
1861 // The total size of the accumulated strings must be restricted to 24 bi ts because
1862 // string indexes appear as compiled pattern operand values.
1863 // This is the only place that the pattern.fLiteralText string is modifi ed.
1864
1865 fRXPat->fLiteralText.append(fLiteralChars); 1879 fRXPat->fLiteralText.append(fLiteralChars);
1866 if (U_SUCCESS(*fStatus) && fRXPat->fLiteralText.length() > 0x00ffffff) {
1867 *fStatus = U_REGEX_PATTERN_TOO_BIG;
1868 }
1869 } 1880 }
1870 1881
1871 fLiteralChars.remove(); 1882 fLiteralChars.remove();
1872 } 1883 }
1873 1884
1874 1885
1875 //------------------------------------------------------------------------------ 1886
1876 // 1887
1877 // appendOp() Append a new instruction onto the compiled pattern
1878 // Includes error checking, limiting the size of the
1879 // pattern to lengths that can be represented in the
1880 // 24 bit operand field of an instruction.
1881 //
1882 //------------------------------------------------------------------------------
1883 void RegexCompile::appendOp(int32_t op) {
1884 fRXPat->fCompiledPat->addElement(op, *fStatus);
1885 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1886 *fStatus = U_REGEX_PATTERN_TOO_BIG;
1887 }
1888 }
1889 1888
1890 1889
1891 //------------------------------------------------------------------------------ 1890 //------------------------------------------------------------------------------
1892 // 1891 //
1893 // insertOp() Insert a slot for a new opcode into the already 1892 // insertOp() Insert a slot for a new opcode into the already
1894 // compiled pattern code. 1893 // compiled pattern code.
1895 // 1894 //
1896 // Fill the slot with a NOP. Our caller will replace i t 1895 // Fill the slot with a NOP. Our caller will replace i t
1897 // with what they really wanted. 1896 // with what they really wanted.
1898 // 1897 //
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
1940 1939
1941 if (fMatchCloseParen > where) { 1940 if (fMatchCloseParen > where) {
1942 fMatchCloseParen++; 1941 fMatchCloseParen++;
1943 } 1942 }
1944 if (fMatchOpenParen > where) { 1943 if (fMatchOpenParen > where) {
1945 fMatchOpenParen++; 1944 fMatchOpenParen++;
1946 } 1945 }
1947 } 1946 }
1948 1947
1949 1948
1949
1950 //------------------------------------------------------------------------------ 1950 //------------------------------------------------------------------------------
1951 // 1951 //
1952 // allocateData() Allocate storage in the matcher's static data area.
1953 // Return the index for the newly allocated data.
1954 // The storage won't actually exist until we are running a match
1955 // operation, but the storage indexes are inserted into various
1956 // opcodes while compiling the pattern.
1957 //
1958 //------------------------------------------------------------------------------
1959 int32_t RegexCompile::allocateData(int32_t size) {
1960 if (U_FAILURE(*fStatus)) {
1961 return 0;
1962 }
1963 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
1964 *fStatus = U_REGEX_INTERNAL_ERROR;
1965 return 0;
1966 }
1967 int32_t dataIndex = fRXPat->fDataSize;
1968 fRXPat->fDataSize += size;
1969 if (fRXPat->fDataSize >= 0x00fffff0) {
1970 *fStatus = U_REGEX_PATTERN_TOO_BIG;
1971 }
1972 return dataIndex;
1973 }
1974
1975
1976 //------------------------------------------------------------------------------
1977 //
1978 // allocateStackData() Allocate space in the back-tracking stack frame.
1979 // Return the index for the newly allocated data.
1980 // The frame indexes are inserted into various
1981 // opcodes while compiling the pattern, meaning that fra me
1982 // size must be restricted to the size that will fit
1983 // as an operand (24 bits).
1984 //
1985 //------------------------------------------------------------------------------
1986 int32_t RegexCompile::allocateStackData(int32_t size) {
1987 if (U_FAILURE(*fStatus)) {
1988 return 0;
1989 }
1990 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
1991 *fStatus = U_REGEX_INTERNAL_ERROR;
1992 return 0;
1993 }
1994 int32_t dataIndex = fRXPat->fFrameSize;
1995 fRXPat->fFrameSize += size;
1996 if (fRXPat->fFrameSize >= 0x00fffff0) {
1997 *fStatus = U_REGEX_PATTERN_TOO_BIG;
1998 }
1999 return dataIndex;
2000 }
2001
2002
2003 //------------------------------------------------------------------------------
2004 //
2005 // blockTopLoc() Find or create a location in the compiled pattern 1952 // blockTopLoc() Find or create a location in the compiled pattern
2006 // at the start of the operation or block that has 1953 // at the start of the operation or block that has
2007 // just been compiled. Needed when a quantifier (* or 1954 // just been compiled. Needed when a quantifier (* or
2008 // whatever) appears, and we need to add an operation 1955 // whatever) appears, and we need to add an operation
2009 // at the start of the thing being quantified. 1956 // at the start of the thing being quantified.
2010 // 1957 //
2011 // (Parenthesized Blocks) have a slot with a NOP that 1958 // (Parenthesized Blocks) have a slot with a NOP that
2012 // is reserved for this purpose. .* or similar don't 1959 // is reserved for this purpose. .* or similar don't
2013 // and a slot needs to be added. 1960 // and a slot needs to be added.
2014 // 1961 //
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after
2110 // Capturing Parentheses. 2057 // Capturing Parentheses.
2111 // Insert a End Capture op into the pattern. 2058 // Insert a End Capture op into the pattern.
2112 // The frame offset of the variables for this cg is obtained from the 2059 // The frame offset of the variables for this cg is obtained from the
2113 // start capture op and put it into the end-capture op. 2060 // start capture op and put it into the end-capture op.
2114 { 2061 {
2115 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMat chOpenParen+1); 2062 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMat chOpenParen+1);
2116 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); 2063 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2117 2064
2118 int32_t frameVarLocation = URX_VAL(captureOp); 2065 int32_t frameVarLocation = URX_VAL(captureOp);
2119 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation ); 2066 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation );
2120 appendOp(endCaptureOp); 2067 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
2121 } 2068 }
2122 break; 2069 break;
2123 case atomic: 2070 case atomic:
2124 // Atomic Parenthesis. 2071 // Atomic Parenthesis.
2125 // Insert a LD_SP operation to restore the state stack to the position 2072 // Insert a LD_SP operation to restore the state stack to the position
2126 // it was when the atomic parens were entered. 2073 // it was when the atomic parens were entered.
2127 { 2074 {
2128 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOp enParen+1); 2075 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOp enParen+1);
2129 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); 2076 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2130 int32_t stoLoc = URX_VAL(stoOp); 2077 int32_t stoLoc = URX_VAL(stoOp);
2131 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); 2078 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc);
2132 appendOp(ldOp); 2079 fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
2133 } 2080 }
2134 break; 2081 break;
2135 2082
2136 case lookAhead: 2083 case lookAhead:
2137 { 2084 {
2138 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO penParen-5); 2085 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO penParen-5);
2139 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2086 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2140 int32_t dataLoc = URX_VAL(startOp); 2087 int32_t dataLoc = URX_VAL(startOp);
2141 int32_t op = URX_BUILD(URX_LA_END, dataLoc); 2088 int32_t op = URX_BUILD(URX_LA_END, dataLoc);
2142 appendOp(op); 2089 fRXPat->fCompiledPat->addElement(op, *fStatus);
2143 } 2090 }
2144 break; 2091 break;
2145 2092
2146 case negLookAhead: 2093 case negLookAhead:
2147 { 2094 {
2148 // See comment at doOpenLookAheadNeg 2095 // See comment at doOpenLookAheadNeg
2149 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO penParen-1); 2096 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO penParen-1);
2150 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2097 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2151 int32_t dataLoc = URX_VAL(startOp); 2098 int32_t dataLoc = URX_VAL(startOp);
2152 int32_t op = URX_BUILD(URX_LA_END, dataLoc); 2099 int32_t op = URX_BUILD(URX_LA_END, dataLoc);
2153 appendOp(op); 2100 fRXPat->fCompiledPat->addElement(op, *fStatus);
2154 op = URX_BUILD(URX_BACKTRACK, 0); 2101 op = URX_BUILD(URX_BACKTRACK, 0);
2155 appendOp(op); 2102 fRXPat->fCompiledPat->addElement(op, *fStatus);
2156 op = URX_BUILD(URX_LA_END, dataLoc); 2103 op = URX_BUILD(URX_LA_END, dataLoc);
2157 appendOp(op); 2104 fRXPat->fCompiledPat->addElement(op, *fStatus);
2158 2105
2159 // Patch the URX_SAVE near the top of the block. 2106 // Patch the URX_SAVE near the top of the block.
2160 // The destination of the SAVE is the final LA_END that was just add ed. 2107 // The destination of the SAVE is the final LA_END that was just add ed.
2161 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO penParen); 2108 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO penParen);
2162 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); 2109 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2163 int32_t dest = fRXPat->fCompiledPat->size()-1; 2110 int32_t dest = fRXPat->fCompiledPat->size()-1;
2164 saveOp = URX_BUILD(URX_STATE_SAVE, dest); 2111 saveOp = URX_BUILD(URX_STATE_SAVE, dest);
2165 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); 2112 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2166 } 2113 }
2167 break; 2114 break;
2168 2115
2169 case lookBehind: 2116 case lookBehind:
2170 { 2117 {
2171 // See comment at doOpenLookBehind. 2118 // See comment at doOpenLookBehind.
2172 2119
2173 // Append the URX_LB_END and URX_LA_END to the compiled pattern. 2120 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2174 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO penParen-4); 2121 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO penParen-4);
2175 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2122 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2176 int32_t dataLoc = URX_VAL(startOp); 2123 int32_t dataLoc = URX_VAL(startOp);
2177 int32_t op = URX_BUILD(URX_LB_END, dataLoc); 2124 int32_t op = URX_BUILD(URX_LB_END, dataLoc);
2178 appendOp(op); 2125 fRXPat->fCompiledPat->addElement(op, *fStatus);
2179 op = URX_BUILD(URX_LA_END, dataLoc); 2126 op = URX_BUILD(URX_LA_END, dataLoc);
2180 appendOp(op); 2127 fRXPat->fCompiledPat->addElement(op, *fStatus);
2181 2128
2182 // Determine the min and max bounds for the length of the 2129 // Determine the min and max bounds for the length of the
2183 // string that the pattern can match. 2130 // string that the pattern can match.
2184 // An unbounded upper limit is an error. 2131 // An unbounded upper limit is an error.
2185 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2132 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2186 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2133 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2187 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2134 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2188 if (URX_TYPE(maxML) != 0) {
2189 error(U_REGEX_LOOK_BEHIND_LIMIT);
2190 break;
2191 }
2192 if (maxML == INT32_MAX) { 2135 if (maxML == INT32_MAX) {
2193 error(U_REGEX_LOOK_BEHIND_LIMIT); 2136 error(U_REGEX_LOOK_BEHIND_LIMIT);
2194 break; 2137 break;
2195 } 2138 }
2196 U_ASSERT(minML <= maxML); 2139 U_ASSERT(minML <= maxML);
2197 2140
2198 // Insert the min and max match len bounds into the URX_LB_CONT op t hat 2141 // Insert the min and max match len bounds into the URX_LB_CONT op t hat
2199 // appears at the top of the look-behind block, at location fMatchO penParen+1 2142 // appears at the top of the look-behind block, at location fMatchO penParen+1
2200 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); 2143 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
2201 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); 2144 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
2202 2145
2203 } 2146 }
2204 break; 2147 break;
2205 2148
2206 2149
2207 2150
2208 case lookBehindN: 2151 case lookBehindN:
2209 { 2152 {
2210 // See comment at doOpenLookBehindNeg. 2153 // See comment at doOpenLookBehindNeg.
2211 2154
2212 // Append the URX_LBN_END to the compiled pattern. 2155 // Append the URX_LBN_END to the compiled pattern.
2213 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO penParen-5); 2156 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO penParen-5);
2214 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2157 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2215 int32_t dataLoc = URX_VAL(startOp); 2158 int32_t dataLoc = URX_VAL(startOp);
2216 int32_t op = URX_BUILD(URX_LBN_END, dataLoc); 2159 int32_t op = URX_BUILD(URX_LBN_END, dataLoc);
2217 appendOp(op); 2160 fRXPat->fCompiledPat->addElement(op, *fStatus);
2218 2161
2219 // Determine the min and max bounds for the length of the 2162 // Determine the min and max bounds for the length of the
2220 // string that the pattern can match. 2163 // string that the pattern can match.
2221 // An unbounded upper limit is an error. 2164 // An unbounded upper limit is an error.
2222 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2165 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2223 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2166 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2224 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2167 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2225 if (URX_TYPE(maxML) != 0) {
2226 error(U_REGEX_LOOK_BEHIND_LIMIT);
2227 break;
2228 }
2229 if (maxML == INT32_MAX) { 2168 if (maxML == INT32_MAX) {
2230 error(U_REGEX_LOOK_BEHIND_LIMIT); 2169 error(U_REGEX_LOOK_BEHIND_LIMIT);
2231 break; 2170 break;
2232 } 2171 }
2233 U_ASSERT(minML <= maxML); 2172 U_ASSERT(minML <= maxML);
2234 2173
2235 // Insert the min and max match len bounds into the URX_LB_CONT op t hat 2174 // Insert the min and max match len bounds into the URX_LB_CONT op t hat
2236 // appears at the top of the look-behind block, at location fMatchO penParen+1 2175 // appears at the top of the look-behind block, at location fMatchO penParen+1
2237 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); 2176 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
2238 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); 2177 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
2273 // There shoudn't be any, but just in case. 2212 // There shoudn't be any, but just in case.
2274 // (Case Closure can add them; if we had a simple case closure avaialble that 2213 // (Case Closure can add them; if we had a simple case closure avaialble that
2275 // ignored strings, that would be better.) 2214 // ignored strings, that would be better.)
2276 theSet->removeAllStrings(); 2215 theSet->removeAllStrings();
2277 int32_t setSize = theSet->size(); 2216 int32_t setSize = theSet->size();
2278 2217
2279 switch (setSize) { 2218 switch (setSize) {
2280 case 0: 2219 case 0:
2281 { 2220 {
2282 // Set of no elements. Always fails to match. 2221 // Set of no elements. Always fails to match.
2283 appendOp(URX_BUILD(URX_BACKTRACK, 0)); 2222 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStat us);
2284 delete theSet; 2223 delete theSet;
2285 } 2224 }
2286 break; 2225 break;
2287 2226
2288 case 1: 2227 case 1:
2289 { 2228 {
2290 // The set contains only a single code point. Put it into 2229 // The set contains only a single code point. Put it into
2291 // the compiled pattern as a single char operation rather 2230 // the compiled pattern as a single char operation rather
2292 // than a set, and discard the set itself. 2231 // than a set, and discard the set itself.
2293 literalChar(theSet->charAt(0)); 2232 literalChar(theSet->charAt(0));
2294 delete theSet; 2233 delete theSet;
2295 } 2234 }
2296 break; 2235 break;
2297 2236
2298 default: 2237 default:
2299 { 2238 {
2300 // The set contains two or more chars. (the normal case) 2239 // The set contains two or more chars. (the normal case)
2301 // Put it into the compiled pattern as a set. 2240 // Put it into the compiled pattern as a set.
2302 int32_t setNumber = fRXPat->fSets->size(); 2241 int32_t setNumber = fRXPat->fSets->size();
2303 fRXPat->fSets->addElement(theSet, *fStatus); 2242 fRXPat->fSets->addElement(theSet, *fStatus);
2304 int32_t setOp = URX_BUILD(URX_SETREF, setNumber); 2243 int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
2305 appendOp(setOp); 2244 fRXPat->fCompiledPat->addElement(setOp, *fStatus);
2306 } 2245 }
2307 } 2246 }
2308 } 2247 }
2309 2248
2310 2249
2311 //------------------------------------------------------------------------------ 2250 //------------------------------------------------------------------------------
2312 // 2251 //
2313 // compileInterval Generate the code for a {min, max} style interval quanti fier. 2252 // compileInterval Generate the code for a {min, max} style interval quanti fier.
2314 // Except for the specific opcodes used, the code is the sa me 2253 // Except for the specific opcodes used, the code is the sa me
2315 // for all three types (greedy, non-greedy, possessive) of 2254 // for all three types (greedy, non-greedy, possessive) of
(...skipping 18 matching lines...) Expand all
2334 int32_t topOfBlock = blockTopLoc(TRUE); 2273 int32_t topOfBlock = blockTopLoc(TRUE);
2335 insertOp(topOfBlock); 2274 insertOp(topOfBlock);
2336 insertOp(topOfBlock); 2275 insertOp(topOfBlock);
2337 insertOp(topOfBlock); 2276 insertOp(topOfBlock);
2338 2277
2339 // The operands for the CTR_INIT opcode include the index in the matcher dat a 2278 // The operands for the CTR_INIT opcode include the index in the matcher dat a
2340 // of the counter. Allocate it now. There are two data items 2279 // of the counter. Allocate it now. There are two data items
2341 // counterLoc --> Loop counter 2280 // counterLoc --> Loop counter
2342 // +1 --> Input index (for breaking non-progressing loops) 2281 // +1 --> Input index (for breaking non-progressing loops)
2343 // (Only present if unbounded upper limit on loop) 2282 // (Only present if unbounded upper limit on loop)
2344 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1; 2283 int32_t counterLoc = fRXPat->fFrameSize;
2345 int32_t counterLoc = allocateStackData(dataSize); 2284 fRXPat->fFrameSize++;
2285 if (fIntervalUpper < 0) {
2286 fRXPat->fFrameSize++;
2287 }
2346 2288
2347 int32_t op = URX_BUILD(InitOp, counterLoc); 2289 int32_t op = URX_BUILD(InitOp, counterLoc);
2348 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); 2290 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2349 2291
2350 // The second operand of CTR_INIT is the location following the end of the loop. 2292 // The second operand of CTR_INIT is the location following the end of the loop.
2351 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the 2293 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2352 // compilation of something later on causes the code to grow and the target 2294 // compilation of something later on causes the code to grow and the target
2353 // position to move. 2295 // position to move.
2354 int32_t loopEnd = fRXPat->fCompiledPat->size(); 2296 int32_t loopEnd = fRXPat->fCompiledPat->size();
2355 op = URX_BUILD(URX_RELOC_OPRND, loopEnd); 2297 op = URX_BUILD(URX_RELOC_OPRND, loopEnd);
2356 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); 2298 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2357 2299
2358 // Followed by the min and max counts. 2300 // Followed by the min and max counts.
2359 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); 2301 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2360 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); 2302 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2361 2303
2362 // Append the CTR_LOOP op. The operand is the location of the CTR_INIT op. 2304 // Append the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2363 // Goes at end of the block being looped over, so just append to the code so far. 2305 // Goes at end of the block being looped over, so just append to the code so far.
2364 op = URX_BUILD(LoopOp, topOfBlock); 2306 op = URX_BUILD(LoopOp, topOfBlock);
2365 appendOp(op); 2307 fRXPat->fCompiledPat->addElement(op, *fStatus);
2366 2308
2367 if ((fIntervalLow & 0xff000000) != 0 || 2309 if ((fIntervalLow & 0xff000000) != 0 ||
2368 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { 2310 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2369 error(U_REGEX_NUMBER_TOO_BIG); 2311 error(U_REGEX_NUMBER_TOO_BIG);
2370 } 2312 }
2371 2313
2372 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { 2314 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2373 error(U_REGEX_MAX_LT_MIN); 2315 error(U_REGEX_MAX_LT_MIN);
2374 } 2316 }
2375 } 2317 }
2376 2318
2377 2319
2378 2320
2379 UBool RegexCompile::compileInlineInterval() { 2321 UBool RegexCompile::compileInlineInterval() {
2380 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { 2322 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2381 // Too big to inline. Fail, which will cause looping code to be generated. 2323 // Too big to inline. Fail, which will cause looping code to be generated.
2382 // (Upper < Lower picks up unbounded upper and errors, both.) 2324 // (Upper < Lower picks up unbounded upper and errors, both.)
2383 return FALSE; 2325 return FALSE;
2384 } 2326 }
2385 2327
2386 int32_t topOfBlock = blockTopLoc(FALSE); 2328 int32_t topOfBlock = blockTopLoc(FALSE);
2387 if (fIntervalUpper == 0) { 2329 if (fIntervalUpper == 0) {
2388 // Pathological case. Attempt no matches, as if the block doesn't exist. 2330 // Pathological case. Attempt no matches, as if the block doesn't exist.
2389 // Discard the generated code for the block.
2390 // If the block included parens, discard the info pertaining to them as well.
2391 fRXPat->fCompiledPat->setSize(topOfBlock); 2331 fRXPat->fCompiledPat->setSize(topOfBlock);
2392 if (fMatchOpenParen >= topOfBlock) {
2393 fMatchOpenParen = -1;
2394 }
2395 if (fMatchCloseParen >= topOfBlock) {
2396 fMatchCloseParen = -1;
2397 }
2398 return TRUE; 2332 return TRUE;
2399 } 2333 }
2400 2334
2401 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { 2335 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2402 // The thing being repeated is not a single op, but some 2336 // The thing being repeated is not a single op, but some
2403 // more complex block. Do it as a loop, not inlines. 2337 // more complex block. Do it as a loop, not inlines.
2404 // Note that things "repeated" a max of once are handled as inline, because 2338 // Note that things "repeated" a max of once are handled as inline, because
2405 // the one copy of the code already generated is just fine. 2339 // the one copy of the code already generated is just fine.
2406 return FALSE; 2340 return FALSE;
2407 } 2341 }
(...skipping 14 matching lines...) Expand all
2422 } 2356 }
2423 2357
2424 2358
2425 2359
2426 // Loop, emitting the op for the thing being repeated each time. 2360 // Loop, emitting the op for the thing being repeated each time.
2427 // Loop starts at 1 because one instance of the op already exists in the pattern, 2361 // Loop starts at 1 because one instance of the op already exists in the pattern,
2428 // it was put there when it was originally encountered. 2362 // it was put there when it was originally encountered.
2429 int32_t i; 2363 int32_t i;
2430 for (i=1; i<fIntervalUpper; i++ ) { 2364 for (i=1; i<fIntervalUpper; i++ ) {
2431 if (i == fIntervalLow) { 2365 if (i == fIntervalLow) {
2432 appendOp(saveOp); 2366 fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
2433 } 2367 }
2434 if (i > fIntervalLow) { 2368 if (i > fIntervalLow) {
2435 appendOp(saveOp); 2369 fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
2436 } 2370 }
2437 appendOp(op); 2371 fRXPat->fCompiledPat->addElement(op, *fStatus);
2438 } 2372 }
2439 return TRUE; 2373 return TRUE;
2440 } 2374 }
2441 2375
2442 2376
2443 2377
2444 //------------------------------------------------------------------------------ 2378 //------------------------------------------------------------------------------
2445 // 2379 //
2380 // caseInsensitiveStart given a single code point from a pattern string, determine the
2381 // set of characters that could potentially begin a case-insensitive
2382 // match of a string beginning with that character, using full Unicode
2383 // case insensitive matching.
2384 //
2385 // This is used in optimizing find().
2386 //
2387 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2388 // misses cases like this:
2389 // A string from the pattern begins with 'ss' (although all we know
2390 // in this context is that it begins with 's')
2391 // The pattern could match a string beginning with a German sharp-s
2392 //
2393 // To the ordinary case closure for a character c, we add all other
2394 // characters cx where the case closure of cx includes a string form that begins
2395 // with the original character c.
2396 //
2397 // This function could be made smarter. The full pattern string is available
2398 // and it would be possible to verify that the extra characters being added
2399 // to the starting set fully match, rather than having just a first-char of the
2400 // folded form match.
2401 //
2402 //------------------------------------------------------------------------------
2403 void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterCh ars) {
2404
2405 // Machine Generated below.
2406 // It may need updating with new versions of Unicode.
2407 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2408 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2409
2410 // Machine Generated Data. Do not hand edit.
2411 static const UChar32 RECaseFixCodePoints[] = {
2412 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2413 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2414 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x 1f07,
2415 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2416 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2417
2418 static const int16_t RECaseFixStringOffsets[] = {
2419 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2420 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2421 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2422 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2423 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2424
2425 static const int16_t RECaseFixCounts[] = {
2426 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2427 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2428 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2429 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2430 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2431
2432 static const UChar RECaseFixData[] = {
2433 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0x df,
2434 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0 x1fb3,
2435 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0 x1fd3,
2436 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0 x1fe3,
2437 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0 xfb14,
2438 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2439 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2440 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2441 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2442 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2443 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2444
2445 // End of machine generated data.
2446
2447 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2448 UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2449 starterChars->set(caseFoldedC, caseFoldedC);
2450
2451 int32_t i;
2452 for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2453 // Simple linear search through the sorted list of interesting code points.
2454 }
2455
2456 if (RECaseFixCodePoints[i] == c) {
2457 int32_t dataIndex = RECaseFixStringOffsets[i];
2458 int32_t numCharsToAdd = RECaseFixCounts[i];
2459 UChar32 cpToAdd = 0;
2460 for (int32_t j=0; j<numCharsToAdd; j++) {
2461 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2462 starterChars->add(cpToAdd);
2463 }
2464 }
2465
2466 starterChars->closeOver(USET_CASE_INSENSITIVE);
2467 starterChars->removeAllStrings();
2468 } else {
2469 // Not a cased character. Just return it alone.
2470 starterChars->set(c, c);
2471 }
2472 }
2473
2474
2475
2476
2477 //------------------------------------------------------------------------------
2478 //
2446 // matchStartType Determine how a match can start. 2479 // matchStartType Determine how a match can start.
2447 // Used to optimize find() operations. 2480 // Used to optimize find() operations.
2448 // 2481 //
2449 // Operation is very similar to minMatchLength(). Walk the compiled 2482 // Operation is very similar to minMatchLength(). Walk the compiled
2450 // pattern, keeping an on-going minimum-match-length. For any 2483 // pattern, keeping an on-going minimum-match-length. For any
2451 // op where the min match coming in is zero, add that ops possible 2484 // op where the min match coming in is zero, add that ops possible
2452 // starting matches to the possible starts for the overall pattern. 2485 // starting matches to the possible starts for the overall pattern.
2453 // 2486 //
2454 //------------------------------------------------------------------------------ 2487 //------------------------------------------------------------------------------
2455 void RegexCompile::matchStartType() { 2488 void RegexCompile::matchStartType() {
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after
2507 case URX_BACKSLASH_G: 2540 case URX_BACKSLASH_G:
2508 case URX_BACKSLASH_Z: 2541 case URX_BACKSLASH_Z:
2509 case URX_DOLLAR: 2542 case URX_DOLLAR:
2510 case URX_DOLLAR_M: 2543 case URX_DOLLAR_M:
2511 case URX_DOLLAR_D: 2544 case URX_DOLLAR_D:
2512 case URX_DOLLAR_MD: 2545 case URX_DOLLAR_MD:
2513 case URX_RELOC_OPRND: 2546 case URX_RELOC_OPRND:
2514 case URX_STO_INP_LOC: 2547 case URX_STO_INP_LOC:
2515 case URX_BACKREF: // BackRef. Must assume that it might be a ze ro length match 2548 case URX_BACKREF: // BackRef. Must assume that it might be a ze ro length match
2516 case URX_BACKREF_I: 2549 case URX_BACKREF_I:
2517 2550
2518 case URX_STO_SP: // Setup for atomic or possessive blocks. Doe sn't change what can match. 2551 case URX_STO_SP: // Setup for atomic or possessive blocks. Doe sn't change what can match.
2519 case URX_LD_SP: 2552 case URX_LD_SP:
2520 break; 2553 break;
2521 2554
2522 case URX_CARET: 2555 case URX_CARET:
2523 if (atStart) { 2556 if (atStart) {
2524 fRXPat->fStartType = START_START; 2557 fRXPat->fStartType = START_START;
2525 } 2558 }
2526 break; 2559 break;
2527 2560
(...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after
2624 currentLen++; 2657 currentLen++;
2625 atStart = FALSE; 2658 atStart = FALSE;
2626 break; 2659 break;
2627 2660
2628 2661
2629 case URX_ONECHAR_I: 2662 case URX_ONECHAR_I:
2630 // Case Insensitive Single Character. 2663 // Case Insensitive Single Character.
2631 if (currentLen == 0) { 2664 if (currentLen == 0) {
2632 UChar32 c = URX_VAL(op); 2665 UChar32 c = URX_VAL(op);
2633 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 2666 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2634 2667 UnicodeSet starters(c, c);
2635 // Disable optimizations on first char of match. 2668 starters.closeOver(USET_CASE_INSENSITIVE);
2636 // TODO: Compute the set of chars that case fold to this cha r, or to 2669 // findCaseInsensitiveStarters(c, &starters);
2637 // a string that begins with this char. 2670 // For ONECHAR_I, no need to worry about text chars that e xpand on folding into strings.
2638 // For simple case folding, this code worked: 2671 // The expanded folding can't match the pattern.
2639 // UnicodeSet s(c, c); 2672 fRXPat->fInitialChars->addAll(starters);
2640 // s.closeOver(USET_CASE_INSENSITIVE);
2641 // fRXPat->fInitialChars->addAll(s);
2642
2643 fRXPat->fInitialChars->clear();
2644 fRXPat->fInitialChars->complement();
2645 } else { 2673 } else {
2646 // Char has no case variants. Just add it as-is to the 2674 // Char has no case variants. Just add it as-is to the
2647 // set of possible starting chars. 2675 // set of possible starting chars.
2648 fRXPat->fInitialChars->add(c); 2676 fRXPat->fInitialChars->add(c);
2649 } 2677 }
2650 numInitialStrings += 2; 2678 numInitialStrings += 2;
2651 } 2679 }
2652 currentLen++; 2680 currentLen++;
2653 atStart = FALSE; 2681 atStart = FALSE;
2654 break; 2682 break;
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after
2757 loc++; 2785 loc++;
2758 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti( loc); 2786 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti( loc);
2759 int32_t stringLen = URX_VAL(stringLenOp); 2787 int32_t stringLen = URX_VAL(stringLenOp);
2760 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 2788 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2761 U_ASSERT(stringLenOp >= 2); 2789 U_ASSERT(stringLenOp >= 2);
2762 if (currentLen == 0) { 2790 if (currentLen == 0) {
2763 // Add the starting character of this string to the set of p ossible starting 2791 // Add the starting character of this string to the set of p ossible starting
2764 // characters for this pattern. 2792 // characters for this pattern.
2765 int32_t stringStartIdx = URX_VAL(op); 2793 int32_t stringStartIdx = URX_VAL(op);
2766 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 2794 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2767 UnicodeSet s(c, c); 2795 UnicodeSet s;
2768 2796 findCaseInsensitiveStarters(c, &s);
2769 // TODO: compute correct set of starting chars for full cas e folding.
2770 // For the moment, say any char can start.
2771 // s.closeOver(USET_CASE_INSENSITIVE);
2772 s.clear();
2773 s.complement();
2774
2775 fRXPat->fInitialChars->addAll(s); 2797 fRXPat->fInitialChars->addAll(s);
2776 numInitialStrings += 2; // Matching on an initial string no t possible. 2798 numInitialStrings += 2; // Matching on an initial string no t possible.
2777 } 2799 }
2778 currentLen += stringLen; 2800 currentLen += stringLen;
2779 atStart = FALSE; 2801 atStart = FALSE;
2780 } 2802 }
2781 break; 2803 break;
2782 2804
2783 case URX_CTR_INIT: 2805 case URX_CTR_INIT:
2784 case URX_CTR_INIT_NG: 2806 case URX_CTR_INIT_NG:
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
2820 // don't change the minimum match 2842 // don't change the minimum match
2821 atStart = FALSE; 2843 atStart = FALSE;
2822 break; 2844 break;
2823 2845
2824 2846
2825 case URX_LA_START: 2847 case URX_LA_START:
2826 case URX_LB_START: 2848 case URX_LB_START:
2827 { 2849 {
2828 // Look-around. Scan forward until the matching look-ahead end, 2850 // Look-around. Scan forward until the matching look-ahead end,
2829 // without processing the look-around block. This is overly pessimistic. 2851 // without processing the look-around block. This is overly pessimistic.
2830 2852
2831 // Keep track of the nesting depth of look-around blocks. Boilerplate code for 2853 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
2832 // lookahead contains two LA_END instructions, so count goes up by two 2854 // lookahead contains two LA_END instructions, so count goes up by two
2833 // for each LA_START. 2855 // for each LA_START.
2834 int32_t depth = (opType == URX_LA_START? 2: 1); 2856 int32_t depth = (opType == URX_LA_START? 2: 1);
2835 for (;;) { 2857 for (;;) {
2836 loc++; 2858 loc++;
2837 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2859 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2838 if (URX_TYPE(op) == URX_LA_START) { 2860 if (URX_TYPE(op) == URX_LA_START) {
2839 depth+=2; 2861 depth+=2;
2840 } 2862 }
(...skipping 539 matching lines...) Expand 10 before | Expand all | Expand 10 after
3380 3402
3381 case URX_STRING_I: 3403 case URX_STRING_I:
3382 // TODO: This code assumes that any user string that matches will be no longer 3404 // TODO: This code assumes that any user string that matches will be no longer
3383 // than our compiled string, with case insensitive matching. 3405 // than our compiled string, with case insensitive matching.
3384 // Our compiled string has been case-folded already. 3406 // Our compiled string has been case-folded already.
3385 // 3407 //
3386 // Any matching user string will have no more code points than our 3408 // Any matching user string will have no more code points than our
3387 // compiled (folded) string. Folding may add code points, but 3409 // compiled (folded) string. Folding may add code points, but
3388 // not remove them. 3410 // not remove them.
3389 // 3411 //
3390 // There is a potential problem if a supplemental code point 3412 // There is a potential problem if a supplemental code point
3391 // case-folds to a BMP code point. In this case our compiled string 3413 // case-folds to a BMP code point. In this case our compiled string
3392 // could be shorter (in code units) than a matching user string. 3414 // could be shorter (in code units) than a matching user string.
3393 // 3415 //
3394 // At this time (Unicode 6.1) there are no such characters, and this case 3416 // At this time (Unicode 6.1) there are no such characters, and this case
3395 // is not being handled. A test, intltest regex/Bug9283, will fail if 3417 // is not being handled. A test, intltest regex/Bug9283, will fail if
3396 // any problematic characters are added to Unicode. 3418 // any problematic characters are added to Unicode.
3397 // 3419 //
3398 // If this happens, we can make a set of the BMP chars that the 3420 // If this happens, we can make a set of the BMP chars that the
3399 // troublesome supplementals fold to, scan our string, and bump the 3421 // troublesome supplementals fold to, scan our string, and bump the
3400 // currentLen one extra for each that is found. 3422 // currentLen one extra for each that is found.
(...skipping 10 matching lines...) Expand all
3411 // For Loops, recursively call this function on the pattern for the loop body, 3433 // For Loops, recursively call this function on the pattern for the loop body,
3412 // then multiply the result by the maximum loop count. 3434 // then multiply the result by the maximum loop count.
3413 { 3435 {
3414 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(l oc+1)); 3436 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(l oc+1));
3415 if (loopEndLoc == loc+4) { 3437 if (loopEndLoc == loc+4) {
3416 // Loop has an empty body. No effect on max match length. 3438 // Loop has an empty body. No effect on max match length.
3417 // Continue processing with code after the loop end. 3439 // Continue processing with code after the loop end.
3418 loc = loopEndLoc; 3440 loc = loopEndLoc;
3419 break; 3441 break;
3420 } 3442 }
3421 3443
3422 int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3); 3444 int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3);
3423 if (maxLoopCount == -1) { 3445 if (maxLoopCount == -1) {
3424 // Unbounded Loop. No upper bound on match length. 3446 // Unbounded Loop. No upper bound on match length.
3425 currentLen = INT32_MAX; 3447 currentLen = INT32_MAX;
3426 break; 3448 break;
3427 } 3449 }
3428 3450
3429 U_ASSERT(loopEndLoc >= loc+4); 3451 U_ASSERT(loopEndLoc >= loc+4);
3430 int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Rec ursive call. 3452 int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Rec ursive call.
3431 if (blockLen == INT32_MAX) { 3453 if (blockLen == INT32_MAX) {
(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after
3529 // will be offset at each location in the original code. 3551 // will be offset at each location in the original code.
3530 int32_t loc; 3552 int32_t loc;
3531 int32_t d = 0; 3553 int32_t d = 0;
3532 for (loc=0; loc<end; loc++) { 3554 for (loc=0; loc<end; loc++) {
3533 deltas.addElement(d, *fStatus); 3555 deltas.addElement(d, *fStatus);
3534 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3556 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3535 if (URX_TYPE(op) == URX_NOP) { 3557 if (URX_TYPE(op) == URX_NOP) {
3536 d++; 3558 d++;
3537 } 3559 }
3538 } 3560 }
3539 3561
3540 UnicodeString caseStringBuffer; 3562 UnicodeString caseStringBuffer;
3541 3563
3542 // Make a second pass over the code, removing the NOPs by moving following 3564 // Make a second pass over the code, removing the NOPs by moving following
3543 // code up, and patching operands that refer to code locations that 3565 // code up, and patching operands that refer to code locations that
3544 // are being moved. The array of offsets from the first step is used 3566 // are being moved. The array of offsets from the first step is used
3545 // to compute the new operand values. 3567 // to compute the new operand values.
3546 int32_t src; 3568 int32_t src;
3547 int32_t dst = 0; 3569 int32_t dst = 0;
3548 for (src=0; src<end; src++) { 3570 for (src=0; src<end; src++) {
3549 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); 3571 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
(...skipping 26 matching lines...) Expand all
3576 { 3598 {
3577 int32_t where = URX_VAL(op); 3599 int32_t where = URX_VAL(op);
3578 if (where > fRXPat->fGroupMap->size()) { 3600 if (where > fRXPat->fGroupMap->size()) {
3579 error(U_REGEX_INVALID_BACK_REF); 3601 error(U_REGEX_INVALID_BACK_REF);
3580 break; 3602 break;
3581 } 3603 }
3582 where = fRXPat->fGroupMap->elementAti(where-1); 3604 where = fRXPat->fGroupMap->elementAti(where-1);
3583 op = URX_BUILD(opType, where); 3605 op = URX_BUILD(opType, where);
3584 fRXPat->fCompiledPat->setElementAt(op, dst); 3606 fRXPat->fCompiledPat->setElementAt(op, dst);
3585 dst++; 3607 dst++;
3586 3608
3587 fRXPat->fNeedsAltInput = TRUE; 3609 fRXPat->fNeedsAltInput = TRUE;
3588 break; 3610 break;
3589 } 3611 }
3590 case URX_RESERVED_OP: 3612 case URX_RESERVED_OP:
3591 case URX_RESERVED_OP_N: 3613 case URX_RESERVED_OP_N:
3592 case URX_BACKTRACK: 3614 case URX_BACKTRACK:
3593 case URX_END: 3615 case URX_END:
3594 case URX_ONECHAR: 3616 case URX_ONECHAR:
3595 case URX_STRING: 3617 case URX_STRING:
3596 case URX_STRING_LEN: 3618 case URX_STRING_LEN:
(...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after
3667 if (fLineNum > 0x7FFFFFFF) { 3689 if (fLineNum > 0x7FFFFFFF) {
3668 fParseErr->line = 0; 3690 fParseErr->line = 0;
3669 fParseErr->offset = -1; 3691 fParseErr->offset = -1;
3670 } else if (fCharNum > 0x7FFFFFFF) { 3692 } else if (fCharNum > 0x7FFFFFFF) {
3671 fParseErr->line = (int32_t)fLineNum; 3693 fParseErr->line = (int32_t)fLineNum;
3672 fParseErr->offset = -1; 3694 fParseErr->offset = -1;
3673 } else { 3695 } else {
3674 fParseErr->line = (int32_t)fLineNum; 3696 fParseErr->line = (int32_t)fLineNum;
3675 fParseErr->offset = (int32_t)fCharNum; 3697 fParseErr->offset = (int32_t)fCharNum;
3676 } 3698 }
3677 3699
3678 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting con text 3700 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting con text
3679 3701
3680 // Fill in the context. 3702 // Fill in the context.
3681 // Note: extractBetween() pins supplied indices to the string bounds. 3703 // Note: extractBetween() pins supplied indices to the string bounds.
3682 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); 3704 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
3683 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); 3705 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3684 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanI ndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); 3706 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanI ndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3685 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_L EN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); 3707 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_L EN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3686 } 3708 }
3687 } 3709 }
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
3721 // 3743 //
3722 //------------------------------------------------------------------------------ 3744 //------------------------------------------------------------------------------
3723 UChar32 RegexCompile::nextCharLL() { 3745 UChar32 RegexCompile::nextCharLL() {
3724 UChar32 ch; 3746 UChar32 ch;
3725 3747
3726 if (fPeekChar != -1) { 3748 if (fPeekChar != -1) {
3727 ch = fPeekChar; 3749 ch = fPeekChar;
3728 fPeekChar = -1; 3750 fPeekChar = -1;
3729 return ch; 3751 return ch;
3730 } 3752 }
3731 3753
3732 // assume we're already in the right place 3754 // assume we're already in the right place
3733 ch = UTEXT_NEXT32(fRXPat->fPattern); 3755 ch = UTEXT_NEXT32(fRXPat->fPattern);
3734 if (ch == U_SENTINEL) { 3756 if (ch == U_SENTINEL) {
3735 return ch; 3757 return ch;
3736 } 3758 }
3737 3759
3738 if (ch == chCR || 3760 if (ch == chCR ||
3739 ch == chNEL || 3761 ch == chNEL ||
3740 ch == chLS || 3762 ch == chLS ||
3741 (ch == chLF && fLastChar != chCR)) { 3763 (ch == chLF && fLastChar != chCR)) {
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
3777 // 3799 //
3778 //------------------------------------------------------------------------------ 3800 //------------------------------------------------------------------------------
3779 void RegexCompile::nextChar(RegexPatternChar &c) { 3801 void RegexCompile::nextChar(RegexPatternChar &c) {
3780 3802
3781 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3803 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
3782 c.fChar = nextCharLL(); 3804 c.fChar = nextCharLL();
3783 c.fQuoted = FALSE; 3805 c.fQuoted = FALSE;
3784 3806
3785 if (fQuoteMode) { 3807 if (fQuoteMode) {
3786 c.fQuoted = TRUE; 3808 c.fQuoted = TRUE;
3787 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_ LITERAL) == 0)) || 3809 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_ LITERAL) == 0)) ||
3788 c.fChar == (UChar32)-1) { 3810 c.fChar == (UChar32)-1) {
3789 fQuoteMode = FALSE; // Exit quote mode, 3811 fQuoteMode = FALSE; // Exit quote mode,
3790 nextCharLL(); // discard the E 3812 nextCharLL(); // discard the E
3791 nextChar(c); // recurse to get the real next char 3813 nextChar(c); // recurse to get the real next char
3792 } 3814 }
3793 } 3815 }
3794 else if (fInBackslashQuote) { 3816 else if (fInBackslashQuote) {
3795 // The current character immediately follows a '\' 3817 // The current character immediately follows a '\'
3796 // Don't check for any further escapes, just return it as-is. 3818 // Don't check for any further escapes, just return it as-is.
3797 // Don't set c.fQuoted, because that would prevent the state machine from 3819 // Don't set c.fQuoted, because that would prevent the state machine from
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
3838 if (c.fChar == chBackSlash) { 3860 if (c.fChar == chBackSlash) {
3839 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3861 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
3840 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekChar LL())) { 3862 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekChar LL())) {
3841 // 3863 //
3842 // A '\' sequence that is handled by ICU's standard unescapeAt f unction. 3864 // A '\' sequence that is handled by ICU's standard unescapeAt f unction.
3843 // Includes \uxxxx, \n, \r, many others. 3865 // Includes \uxxxx, \n, \r, many others.
3844 // Return the single equivalent character. 3866 // Return the single equivalent character.
3845 // 3867 //
3846 nextCharLL(); // get & discard the peeked char. 3868 nextCharLL(); // get & discard the peeked char.
3847 c.fQuoted = TRUE; 3869 c.fQuoted = TRUE;
3848 3870
3849 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) { 3871 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
3850 int32_t endIndex = (int32_t)pos; 3872 int32_t endIndex = (int32_t)pos;
3851 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endInd ex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); 3873 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endInd ex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
3852 3874
3853 if (endIndex == pos) { 3875 if (endIndex == pos) {
3854 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 3876 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
3855 } 3877 }
3856 fCharNum += endIndex - pos; 3878 fCharNum += endIndex - pos;
3857 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); 3879 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
3858 } else { 3880 } else {
3859 int32_t offset = 0; 3881 int32_t offset = 0;
3860 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEX T_UNESCAPE_CONTEXT(fRXPat->fPattern); 3882 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEX T_UNESCAPE_CONTEXT(fRXPat->fPattern);
3861 3883
3862 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); 3884 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
3863 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset , INT32_MAX, &context); 3885 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset , INT32_MAX, &context);
3864 3886
3865 if (offset == 0) { 3887 if (offset == 0) {
3866 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 3888 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
3867 } else if (context.lastOffset == offset) { 3889 } else if (context.lastOffset == offset) {
3868 UTEXT_PREVIOUS32(fRXPat->fPattern); 3890 UTEXT_PREVIOUS32(fRXPat->fPattern);
3869 } else if (context.lastOffset != offset-1) { 3891 } else if (context.lastOffset != offset-1) {
3870 utext_moveIndex32(fRXPat->fPattern, offset - context.las tOffset - 1); 3892 utext_moveIndex32(fRXPat->fPattern, offset - context.las tOffset - 1);
3871 } 3893 }
(...skipping 22 matching lines...) Expand all
3894 } 3916 }
3895 c.fChar <<= 3; 3917 c.fChar <<= 3;
3896 c.fChar += ch&7; 3918 c.fChar += ch&7;
3897 if (c.fChar <= 255) { 3919 if (c.fChar <= 255) {
3898 nextCharLL(); 3920 nextCharLL();
3899 } else { 3921 } else {
3900 // The last digit made the number too big. Forget we saw it. 3922 // The last digit made the number too big. Forget we saw it.
3901 c.fChar >>= 3; 3923 c.fChar >>= 3;
3902 } 3924 }
3903 } 3925 }
3904 c.fQuoted = TRUE; 3926 c.fQuoted = TRUE;
3905 } 3927 }
3906 else if (peekCharLL() == chQ) { 3928 else if (peekCharLL() == chQ) {
3907 // "\Q" enter quote mode, which will continue until "\E" 3929 // "\Q" enter quote mode, which will continue until "\E"
3908 fQuoteMode = TRUE; 3930 fQuoteMode = TRUE;
3909 nextCharLL(); // discard the 'Q'. 3931 nextCharLL(); // discard the 'Q'.
3910 nextChar(c); // recurse to get the real next char. 3932 nextChar(c); // recurse to get the real next char.
3911 } 3933 }
3912 else 3934 else
3913 { 3935 {
3914 // We are in a '\' escape that will be handled by the state table scanner. 3936 // We are in a '\' escape that will be handled by the state table scanner.
3915 // Just return the backslash, but remember that the following char is to 3937 // Just return the backslash, but remember that the following char is to
(...skipping 27 matching lines...) Expand all
3943 UChar32 RegexCompile::scanNamedChar() { 3965 UChar32 RegexCompile::scanNamedChar() {
3944 if (U_FAILURE(*fStatus)) { 3966 if (U_FAILURE(*fStatus)) {
3945 return 0; 3967 return 0;
3946 } 3968 }
3947 3969
3948 nextChar(fC); 3970 nextChar(fC);
3949 if (fC.fChar != chLBrace) { 3971 if (fC.fChar != chLBrace) {
3950 error(U_REGEX_PROPERTY_SYNTAX); 3972 error(U_REGEX_PROPERTY_SYNTAX);
3951 return 0; 3973 return 0;
3952 } 3974 }
3953 3975
3954 UnicodeString charName; 3976 UnicodeString charName;
3955 for (;;) { 3977 for (;;) {
3956 nextChar(fC); 3978 nextChar(fC);
3957 if (fC.fChar == chRBrace) { 3979 if (fC.fChar == chRBrace) {
3958 break; 3980 break;
3959 } 3981 }
3960 if (fC.fChar == -1) { 3982 if (fC.fChar == -1) {
3961 error(U_REGEX_PROPERTY_SYNTAX); 3983 error(U_REGEX_PROPERTY_SYNTAX);
3962 return 0; 3984 return 0;
3963 } 3985 }
3964 charName.append(fC.fChar); 3986 charName.append(fC.fChar);
3965 } 3987 }
3966 3988
3967 char name[100]; 3989 char name[100];
3968 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || 3990 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
3969 (uint32_t)charName.length()>=sizeof(name)) { 3991 (uint32_t)charName.length()>=sizeof(name)) {
3970 // All Unicode character names have only invariant characters. 3992 // All Unicode character names have only invariant characters.
3971 // The API to get a character, given a name, accepts only char *, forcing us to convert, 3993 // The API to get a character, given a name, accepts only char *, forcing us to convert,
3972 // which requires this error check 3994 // which requires this error check
3973 error(U_REGEX_PROPERTY_SYNTAX); 3995 error(U_REGEX_PROPERTY_SYNTAX);
3974 return 0; 3996 return 0;
3975 } 3997 }
3976 charName.extract(0, charName.length(), name, sizeof(name), US_INV); 3998 charName.extract(0, charName.length(), name, sizeof(name), US_INV);
(...skipping 18 matching lines...) Expand all
3995 // Return a UnicodeSet, constructed from the \P pattern, 4017 // Return a UnicodeSet, constructed from the \P pattern,
3996 // or NULL if the pattern is invalid. 4018 // or NULL if the pattern is invalid.
3997 // 4019 //
3998 //------------------------------------------------------------------------------ 4020 //------------------------------------------------------------------------------
3999 UnicodeSet *RegexCompile::scanProp() { 4021 UnicodeSet *RegexCompile::scanProp() {
4000 UnicodeSet *uset = NULL; 4022 UnicodeSet *uset = NULL;
4001 4023
4002 if (U_FAILURE(*fStatus)) { 4024 if (U_FAILURE(*fStatus)) {
4003 return NULL; 4025 return NULL;
4004 } 4026 }
4027 (void)chLowerP; // Suppress compiler unused variable warning.
4005 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); 4028 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4006 UBool negated = (fC.fChar == chP); 4029 UBool negated = (fC.fChar == chP);
4007 4030
4008 UnicodeString propertyName; 4031 UnicodeString propertyName;
4009 nextChar(fC); 4032 nextChar(fC);
4010 if (fC.fChar != chLBrace) { 4033 if (fC.fChar != chLBrace) {
4011 error(U_REGEX_PROPERTY_SYNTAX); 4034 error(U_REGEX_PROPERTY_SYNTAX);
4012 return NULL; 4035 return NULL;
4013 } 4036 }
4014 for (;;) { 4037 for (;;) {
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
4064 UBool savedInBackslashQuote = fInBackslashQuote; 4087 UBool savedInBackslashQuote = fInBackslashQuote;
4065 UBool savedEOLComments = fEOLComments; 4088 UBool savedEOLComments = fEOLComments;
4066 int64_t savedLineNum = fLineNum; 4089 int64_t savedLineNum = fLineNum;
4067 int64_t savedCharNum = fCharNum; 4090 int64_t savedCharNum = fCharNum;
4068 UChar32 savedLastChar = fLastChar; 4091 UChar32 savedLastChar = fLastChar;
4069 UChar32 savedPeekChar = fPeekChar; 4092 UChar32 savedPeekChar = fPeekChar;
4070 RegexPatternChar savedfC = fC; 4093 RegexPatternChar savedfC = fC;
4071 4094
4072 // Scan for a closing ]. A little tricky because there are some perverse 4095 // Scan for a closing ]. A little tricky because there are some perverse
4073 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression, 4096 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4074 // ending on the second closing ]. 4097 // ending on the second closing ].
4075 4098
4076 UnicodeString propName; 4099 UnicodeString propName;
4077 UBool negated = FALSE; 4100 UBool negated = FALSE;
4078 4101
4079 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:] 4102 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4080 nextChar(fC); 4103 nextChar(fC);
4081 if (fC.fChar == chUp) { 4104 if (fC.fChar == chUp) {
4082 negated = TRUE; 4105 negated = TRUE;
4083 nextChar(fC); 4106 nextChar(fC);
4084 } 4107 }
4085 4108
4086 // Scan for the closing ":]", collecting the property name along the way. 4109 // Scan for the closing ":]", collecting the property name along the way.
4087 UBool sawPropSetTerminator = FALSE; 4110 UBool sawPropSetTerminator = FALSE;
4088 for (;;) { 4111 for (;;) {
4089 propName.append(fC.fChar); 4112 propName.append(fC.fChar);
4090 nextChar(fC); 4113 nextChar(fC);
4091 if (fC.fQuoted || fC.fChar == -1) { 4114 if (fC.fQuoted || fC.fChar == -1) {
4092 // Escaped characters or end of input - either says this isn't a [:Property:] 4115 // Escaped characters or end of input - either says this isn't a [:Property:]
4093 break; 4116 break;
4094 } 4117 }
4095 if (fC.fChar == chColon) { 4118 if (fC.fChar == chColon) {
4096 nextChar(fC); 4119 nextChar(fC);
4097 if (fC.fChar == chRBracket) { 4120 if (fC.fChar == chRBracket) {
4098 sawPropSetTerminator = TRUE; 4121 sawPropSetTerminator = TRUE;
4099 } 4122 }
4100 break; 4123 break;
4101 } 4124 }
4102 } 4125 }
4103 4126
4104 if (sawPropSetTerminator) { 4127 if (sawPropSetTerminator) {
4105 uset = createSetForProperty(propName, negated); 4128 uset = createSetForProperty(propName, negated);
4106 } 4129 }
4107 else 4130 else
4108 { 4131 {
4109 // No closing ":]". 4132 // No closing ":]".
4110 // Restore the original scan position. 4133 // Restore the original scan position.
4111 // The main scanner will retry the input as a normal set expression, 4134 // The main scanner will retry the input as a normal set expression,
4112 // not a [:Property:] expression. 4135 // not a [:Property:] expression.
4113 fScanIndex = savedScanIndex; 4136 fScanIndex = savedScanIndex;
(...skipping 12 matching lines...) Expand all
4126 4149
4127 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { 4150 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4128 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); 4151 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4129 addCategory(set, U_GC_CF_MASK, ec); 4152 addCategory(set, U_GC_CF_MASK, ec);
4130 } 4153 }
4131 4154
4132 // 4155 //
4133 // Create a Unicode Set from a Unicode Property expression. 4156 // Create a Unicode Set from a Unicode Property expression.
4134 // This is common code underlying both \p{...} and [:...:] expressions. 4157 // This is common code underlying both \p{...} and [:...:] expressions.
4135 // Includes trying the Java "properties" that aren't supported as 4158 // Includes trying the Java "properties" that aren't supported as
4136 // normal ICU UnicodeSet properties 4159 // normal ICU UnicodeSet properties
4137 // 4160 //
4138 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" 4161 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4139 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" 4162 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
4140 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB ool negated) { 4163 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB ool negated) {
4141 UnicodeString setExpr; 4164 UnicodeString setExpr;
4142 UnicodeSet *set; 4165 UnicodeSet *set;
4143 uint32_t usetFlags = 0; 4166 uint32_t usetFlags = 0;
4144 4167
4145 if (U_FAILURE(*fStatus)) { 4168 if (U_FAILURE(*fStatus)) {
4146 return NULL; 4169 return NULL;
4147 } 4170 }
4148 4171
4149 // 4172 //
4150 // First try the property as we received it 4173 // First try the property as we received it
4151 // 4174 //
4152 if (negated) { 4175 if (negated) {
4153 setExpr.append(negSetPrefix, -1); 4176 setExpr.append(negSetPrefix, -1);
4154 } else { 4177 } else {
4155 setExpr.append(posSetPrefix, -1); 4178 setExpr.append(posSetPrefix, -1);
4156 } 4179 }
4157 setExpr.append(propName); 4180 setExpr.append(propName);
4158 setExpr.append(chRBrace); 4181 setExpr.append(chRBrace);
4159 setExpr.append(chRBracket); 4182 setExpr.append(chRBracket);
4160 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 4183 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4161 usetFlags |= USET_CASE_INSENSITIVE; 4184 usetFlags |= USET_CASE_INSENSITIVE;
4162 } 4185 }
4163 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 4186 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4164 if (U_SUCCESS(*fStatus)) { 4187 if (U_SUCCESS(*fStatus)) {
4165 return set; 4188 return set;
4166 } 4189 }
4167 delete set; 4190 delete set;
4168 set = NULL; 4191 set = NULL;
4169 4192
4170 // 4193 //
4171 // The property as it was didn't work. 4194 // The property as it was didn't work.
4172 4195
4173 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX 4196 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX
4174 // or standard Java, but many other regular expression packages do recognize it. 4197 // or standard Java, but many other regular expression packages do recognize it.
4175 4198
4176 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) { 4199 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4177 *fStatus = U_ZERO_ERROR; 4200 *fStatus = U_ZERO_ERROR;
4178 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])); 4201 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
4179 if (set == NULL) { 4202 if (set == NULL) {
4180 *fStatus = U_MEMORY_ALLOCATION_ERROR; 4203 *fStatus = U_MEMORY_ALLOCATION_ERROR;
4181 return set; 4204 return set;
4182 } 4205 }
4183 if (negated) { 4206 if (negated) {
4184 set->complement(); 4207 set->complement();
4185 } 4208 }
4186 return set; 4209 return set;
4187 } 4210 }
4188 4211
4189 4212
4190 // Do Java fixes - 4213 // Do Java fixes -
4191 // InGreek -> InGreek and Coptic, that being the official Unicode name for that block. 4214 // InGreek -> InGreek and Coptic, that being the official Unicode name for that block.
4192 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols. 4215 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4193 // 4216 //
4194 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols" 4217 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4195 // is accepted by Java. The property part of the name is compared 4218 // is accepted by Java. The property part of the name is compared
4196 // case-insensitively. The spaces must be exactly as shown, either 4219 // case-insensitively. The spaces must be exactly as shown, either
4197 // all there, or all omitted, with exactly one at each position 4220 // all there, or all omitted, with exactly one at each position
4198 // if they are present. From checking against JDK 1.6 4221 // if they are present. From checking against JDK 1.6
4199 // 4222 //
4200 // This code should be removed when ICU properties support the Java compatibility names 4223 // This code should be removed when ICU properties support the Java compatibility names
4201 // (ICU 4.0?) 4224 // (ICU 4.0?)
4202 // 4225 //
4203 UnicodeString mPropName = propName; 4226 UnicodeString mPropName = propName;
4204 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { 4227 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4205 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); 4228 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
4206 } 4229 }
4207 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbo ls"), 0) == 0 || 4230 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbo ls"), 0) == 0 ||
4208 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols" ), 0) == 0) { 4231 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols" ), 0) == 0) {
4209 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Sym bols"); 4232 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Sym bols");
4210 } 4233 }
4211 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 4234 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4212 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); 4235 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
4213 } 4236 }
4214 4237
4215 // See if the property looks like a Java "InBlockName", which 4238 // See if the property looks like a Java "InBlockName", which
4216 // we will recast as "Block=BlockName" 4239 // we will recast as "Block=BlockName"
4217 // 4240 //
4218 static const UChar IN[] = {0x49, 0x6E, 0}; // "In" 4241 static const UChar IN[] = {0x49, 0x6E, 0}; // "In"
4219 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // " Block=" 4242 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // " Block="
4220 if (mPropName.startsWith(IN, 2) && propName.length()>=3) { 4243 if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
4221 setExpr.truncate(4); // Leaves "[\p{", or "[\P{" 4244 setExpr.truncate(4); // Leaves "[\p{", or "[\P{"
4222 setExpr.append(BLOCK, -1); 4245 setExpr.append(BLOCK, -1);
4223 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leadi ng "In" removed. 4246 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leadi ng "In" removed.
4224 setExpr.append(chRBrace); 4247 setExpr.append(chRBrace);
(...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after
4328 } 4351 }
4329 if (negated) { 4352 if (negated) {
4330 set->complement(); 4353 set->complement();
4331 } 4354 }
4332 return set; 4355 return set;
4333 } 4356 }
4334 delete set; 4357 delete set;
4335 set = NULL; 4358 set = NULL;
4336 } 4359 }
4337 error(*fStatus); 4360 error(*fStatus);
4338 return NULL; 4361 return NULL;
4339 } 4362 }
4340 4363
4341 4364
4342 4365
4343 // 4366 //
4344 // SetEval Part of the evaluation of [set expressions]. 4367 // SetEval Part of the evaluation of [set expressions].
4345 // Perform any pending (stacked) operations with precedence 4368 // Perform any pending (stacked) operations with precedence
4346 // equal or greater to that of the next operator encountered 4369 // equal or greater to that of the next operator encountered
4347 // in the expression. 4370 // in the expression.
4348 // 4371 //
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
4396 4419
4397 void RegexCompile::setPushOp(int32_t op) { 4420 void RegexCompile::setPushOp(int32_t op) {
4398 setEval(op); 4421 setEval(op);
4399 fSetOpStack.push(op, *fStatus); 4422 fSetOpStack.push(op, *fStatus);
4400 fSetStack.push(new UnicodeSet(), *fStatus); 4423 fSetStack.push(new UnicodeSet(), *fStatus);
4401 } 4424 }
4402 4425
4403 U_NAMESPACE_END 4426 U_NAMESPACE_END
4404 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 4427 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4405 4428
OLDNEW
« no previous file with comments | « source/i18n/regexcmp.h ('k') | source/i18n/regeximp.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698