| OLD | NEW |
| 1 // | 1 // |
| 2 // file: repattrn.cpp | 2 // file: repattrn.cpp |
| 3 // | 3 // |
| 4 /* | 4 /* |
| 5 *************************************************************************** | 5 *************************************************************************** |
| 6 * Copyright (C) 2002-2012 International Business Machines Corporation * | 6 * Copyright (C) 2002-2013 International Business Machines Corporation * |
| 7 * and others. All rights reserved. * | 7 * and others. All rights reserved. * |
| 8 *************************************************************************** | 8 *************************************************************************** |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include "unicode/utypes.h" | 11 #include "unicode/utypes.h" |
| 12 | 12 |
| 13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | 13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 14 | 14 |
| 15 #include "unicode/regex.h" | 15 #include "unicode/regex.h" |
| 16 #include "unicode/uclean.h" | 16 #include "unicode/uclean.h" |
| (...skipping 251 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 268 //--------------------------------------------------------------------- | 268 //--------------------------------------------------------------------- |
| 269 RegexPattern * U_EXPORT2 | 269 RegexPattern * U_EXPORT2 |
| 270 RegexPattern::compile(const UnicodeString &regex, | 270 RegexPattern::compile(const UnicodeString &regex, |
| 271 uint32_t flags, | 271 uint32_t flags, |
| 272 UParseError &pe, | 272 UParseError &pe, |
| 273 UErrorCode &status) | 273 UErrorCode &status) |
| 274 { | 274 { |
| 275 if (U_FAILURE(status)) { | 275 if (U_FAILURE(status)) { |
| 276 return NULL; | 276 return NULL; |
| 277 } | 277 } |
| 278 | 278 |
| 279 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | | 279 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | |
| 280 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | | 280 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | |
| 281 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; | 281 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; |
| 282 | 282 |
| 283 if ((flags & ~allFlags) != 0) { | 283 if ((flags & ~allFlags) != 0) { |
| 284 status = U_REGEX_INVALID_FLAG; | 284 status = U_REGEX_INVALID_FLAG; |
| 285 return NULL; | 285 return NULL; |
| 286 } | 286 } |
| 287 | 287 |
| 288 if ((flags & UREGEX_CANON_EQ) != 0) { | 288 if ((flags & UREGEX_CANON_EQ) != 0) { |
| 289 status = U_REGEX_UNIMPLEMENTED; | 289 status = U_REGEX_UNIMPLEMENTED; |
| 290 return NULL; | 290 return NULL; |
| 291 } | 291 } |
| 292 | 292 |
| 293 RegexPattern *This = new RegexPattern; | 293 RegexPattern *This = new RegexPattern; |
| 294 if (This == NULL) { | 294 if (This == NULL) { |
| 295 status = U_MEMORY_ALLOCATION_ERROR; | 295 status = U_MEMORY_ALLOCATION_ERROR; |
| 296 return NULL; | 296 return NULL; |
| 297 } | 297 } |
| 298 if (U_FAILURE(This->fDeferredStatus)) { | 298 if (U_FAILURE(This->fDeferredStatus)) { |
| 299 status = This->fDeferredStatus; | 299 status = This->fDeferredStatus; |
| 300 delete This; | 300 delete This; |
| 301 return NULL; | 301 return NULL; |
| 302 } | 302 } |
| 303 This->fFlags = flags; | 303 This->fFlags = flags; |
| 304 | 304 |
| 305 RegexCompile compiler(This, status); | 305 RegexCompile compiler(This, status); |
| 306 compiler.compile(regex, pe, status); | 306 compiler.compile(regex, pe, status); |
| 307 | 307 |
| 308 if (U_FAILURE(status)) { | 308 if (U_FAILURE(status)) { |
| 309 delete This; | 309 delete This; |
| 310 This = NULL; | 310 This = NULL; |
| 311 } | 311 } |
| 312 | 312 |
| 313 return This; | 313 return This; |
| 314 } | 314 } |
| 315 | 315 |
| 316 | 316 |
| 317 // | 317 // |
| 318 // compile, UText mode | 318 // compile, UText mode |
| 319 // | 319 // |
| 320 RegexPattern * U_EXPORT2 | 320 RegexPattern * U_EXPORT2 |
| 321 RegexPattern::compile(UText *regex, | 321 RegexPattern::compile(UText *regex, |
| 322 uint32_t flags, | 322 uint32_t flags, |
| (...skipping 25 matching lines...) Expand all Loading... |
| 348 } | 348 } |
| 349 if (U_FAILURE(This->fDeferredStatus)) { | 349 if (U_FAILURE(This->fDeferredStatus)) { |
| 350 status = This->fDeferredStatus; | 350 status = This->fDeferredStatus; |
| 351 delete This; | 351 delete This; |
| 352 return NULL; | 352 return NULL; |
| 353 } | 353 } |
| 354 This->fFlags = flags; | 354 This->fFlags = flags; |
| 355 | 355 |
| 356 RegexCompile compiler(This, status); | 356 RegexCompile compiler(This, status); |
| 357 compiler.compile(regex, pe, status); | 357 compiler.compile(regex, pe, status); |
| 358 | 358 |
| 359 if (U_FAILURE(status)) { | 359 if (U_FAILURE(status)) { |
| 360 delete This; | 360 delete This; |
| 361 This = NULL; | 361 This = NULL; |
| 362 } | 362 } |
| 363 | 363 |
| 364 return This; | 364 return This; |
| 365 } | 365 } |
| 366 | 366 |
| 367 // | 367 // |
| 368 // compile with default flags. | 368 // compile with default flags. |
| (...skipping 162 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 531 UnicodeString RegexPattern::pattern() const { | 531 UnicodeString RegexPattern::pattern() const { |
| 532 if (fPatternString != NULL) { | 532 if (fPatternString != NULL) { |
| 533 return *fPatternString; | 533 return *fPatternString; |
| 534 } else if (fPattern == NULL) { | 534 } else if (fPattern == NULL) { |
| 535 return UnicodeString(); | 535 return UnicodeString(); |
| 536 } else { | 536 } else { |
| 537 UErrorCode status = U_ZERO_ERROR; | 537 UErrorCode status = U_ZERO_ERROR; |
| 538 int64_t nativeLen = utext_nativeLength(fPattern); | 538 int64_t nativeLen = utext_nativeLength(fPattern); |
| 539 int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error | 539 int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error |
| 540 UnicodeString result; | 540 UnicodeString result; |
| 541 | 541 |
| 542 status = U_ZERO_ERROR; | 542 status = U_ZERO_ERROR; |
| 543 UChar *resultChars = result.getBuffer(len16); | 543 UChar *resultChars = result.getBuffer(len16); |
| 544 utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning | 544 utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning |
| 545 result.releaseBuffer(len16); | 545 result.releaseBuffer(len16); |
| 546 | 546 |
| 547 return result; | 547 return result; |
| 548 } | 548 } |
| 549 } | 549 } |
| 550 | 550 |
| 551 | 551 |
| 552 | 552 |
| 553 | 553 |
| 554 //--------------------------------------------------------------------- | 554 //--------------------------------------------------------------------- |
| 555 // | 555 // |
| 556 // patternText | 556 // patternText |
| (...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 615 } | 615 } |
| 616 | 616 |
| 617 | 617 |
| 618 | 618 |
| 619 //--------------------------------------------------------------------- | 619 //--------------------------------------------------------------------- |
| 620 // | 620 // |
| 621 // dump Output the compiled form of the pattern. | 621 // dump Output the compiled form of the pattern. |
| 622 // Debugging function only. | 622 // Debugging function only. |
| 623 // | 623 // |
| 624 //--------------------------------------------------------------------- | 624 //--------------------------------------------------------------------- |
| 625 void RegexPattern::dumpOp(int32_t index) const { |
| 626 (void)index; // Suppress warnings in non-debug build. |
| 625 #if defined(REGEX_DEBUG) | 627 #if defined(REGEX_DEBUG) |
| 626 void RegexPattern::dumpOp(int32_t index) const { | |
| 627 static const char * const opNames[] = {URX_OPCODE_NAMES}; | 628 static const char * const opNames[] = {URX_OPCODE_NAMES}; |
| 628 int32_t op = fCompiledPat->elementAti(index); | 629 int32_t op = fCompiledPat->elementAti(index); |
| 629 int32_t val = URX_VAL(op); | 630 int32_t val = URX_VAL(op); |
| 630 int32_t type = URX_TYPE(op); | 631 int32_t type = URX_TYPE(op); |
| 631 int32_t pinnedType = type; | 632 int32_t pinnedType = type; |
| 632 if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) { | 633 if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) { |
| 633 pinnedType = 0; | 634 pinnedType = 0; |
| 634 } | 635 } |
| 635 | 636 |
| 636 REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType])); | 637 printf("%4d %08x %-15s ", index, op, opNames[pinnedType]); |
| 637 switch (type) { | 638 switch (type) { |
| 638 case URX_NOP: | 639 case URX_NOP: |
| 639 case URX_DOTANY: | 640 case URX_DOTANY: |
| 640 case URX_DOTANY_ALL: | 641 case URX_DOTANY_ALL: |
| 641 case URX_FAIL: | 642 case URX_FAIL: |
| 642 case URX_CARET: | 643 case URX_CARET: |
| 643 case URX_DOLLAR: | 644 case URX_DOLLAR: |
| 644 case URX_BACKSLASH_G: | 645 case URX_BACKSLASH_G: |
| 645 case URX_BACKSLASH_X: | 646 case URX_BACKSLASH_X: |
| 646 case URX_END: | 647 case URX_END: |
| (...skipping 28 matching lines...) Expand all Loading... |
| 675 case URX_LA_END: | 676 case URX_LA_END: |
| 676 case URX_BACKREF_I: | 677 case URX_BACKREF_I: |
| 677 case URX_LB_START: | 678 case URX_LB_START: |
| 678 case URX_LB_CONT: | 679 case URX_LB_CONT: |
| 679 case URX_LB_END: | 680 case URX_LB_END: |
| 680 case URX_LBN_CONT: | 681 case URX_LBN_CONT: |
| 681 case URX_LBN_END: | 682 case URX_LBN_END: |
| 682 case URX_LOOP_C: | 683 case URX_LOOP_C: |
| 683 case URX_LOOP_DOT_I: | 684 case URX_LOOP_DOT_I: |
| 684 // types with an integer operand field. | 685 // types with an integer operand field. |
| 685 REGEX_DUMP_DEBUG_PRINTF(("%d", val)); | 686 printf("%d", val); |
| 686 break; | 687 break; |
| 687 | 688 |
| 688 case URX_ONECHAR: | 689 case URX_ONECHAR: |
| 689 case URX_ONECHAR_I: | 690 case URX_ONECHAR_I: |
| 690 REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?')); | 691 printf("%c", val<256?val:'?'); |
| 691 break; | 692 break; |
| 692 | 693 |
| 693 case URX_STRING: | 694 case URX_STRING: |
| 694 case URX_STRING_I: | 695 case URX_STRING_I: |
| 695 { | 696 { |
| 696 int32_t lengthOp = fCompiledPat->elementAti(index+1); | 697 int32_t lengthOp = fCompiledPat->elementAti(index+1); |
| 697 U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); | 698 U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); |
| 698 int32_t length = URX_VAL(lengthOp); | 699 int32_t length = URX_VAL(lengthOp); |
| 699 int32_t i; | 700 int32_t i; |
| 700 for (i=val; i<val+length; i++) { | 701 for (i=val; i<val+length; i++) { |
| 701 UChar c = fLiteralText[i]; | 702 UChar c = fLiteralText[i]; |
| 702 if (c < 32 || c >= 256) {c = '.';} | 703 if (c < 32 || c >= 256) {c = '.';} |
| 703 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); | 704 printf("%c", c); |
| 704 } | 705 } |
| 705 } | 706 } |
| 706 break; | 707 break; |
| 707 | 708 |
| 708 case URX_SETREF: | 709 case URX_SETREF: |
| 709 case URX_LOOP_SR_I: | 710 case URX_LOOP_SR_I: |
| 710 { | 711 { |
| 711 UnicodeString s; | 712 UnicodeString s; |
| 712 UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); | 713 UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); |
| 713 set->toPattern(s, TRUE); | 714 set->toPattern(s, TRUE); |
| 714 for (int32_t i=0; i<s.length(); i++) { | 715 for (int32_t i=0; i<s.length(); i++) { |
| 715 REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i))); | 716 printf("%c", s.charAt(i)); |
| 716 } | 717 } |
| 717 } | 718 } |
| 718 break; | 719 break; |
| 719 | 720 |
| 720 case URX_STATIC_SETREF: | 721 case URX_STATIC_SETREF: |
| 721 case URX_STAT_SETREF_N: | 722 case URX_STAT_SETREF_N: |
| 722 { | 723 { |
| 723 UnicodeString s; | 724 UnicodeString s; |
| 724 if (val & URX_NEG_SET) { | 725 if (val & URX_NEG_SET) { |
| 725 REGEX_DUMP_DEBUG_PRINTF(("NOT ")); | 726 printf("NOT "); |
| 726 val &= ~URX_NEG_SET; | 727 val &= ~URX_NEG_SET; |
| 727 } | 728 } |
| 728 UnicodeSet *set = fStaticSets[val]; | 729 UnicodeSet *set = fStaticSets[val]; |
| 729 set->toPattern(s, TRUE); | 730 set->toPattern(s, TRUE); |
| 730 for (int32_t i=0; i<s.length(); i++) { | 731 for (int32_t i=0; i<s.length(); i++) { |
| 731 REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i))); | 732 printf("%c", s.charAt(i)); |
| 732 } | 733 } |
| 733 } | 734 } |
| 734 break; | 735 break; |
| 735 | 736 |
| 736 | 737 |
| 737 default: | 738 default: |
| 738 REGEX_DUMP_DEBUG_PRINTF(("??????")); | 739 printf("??????"); |
| 739 break; | 740 break; |
| 740 } | 741 } |
| 741 REGEX_DUMP_DEBUG_PRINTF(("\n")); | 742 printf("\n"); |
| 743 #endif |
| 742 } | 744 } |
| 743 #endif | |
| 744 | 745 |
| 745 | 746 |
| 747 void RegexPattern::dumpPattern() const { |
| 746 #if defined(REGEX_DEBUG) | 748 #if defined(REGEX_DEBUG) |
| 747 U_CAPI void U_EXPORT2 | |
| 748 RegexPatternDump(const RegexPattern *This) { | |
| 749 int index; | 749 int index; |
| 750 int i; | 750 int i; |
| 751 | 751 |
| 752 REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: ")); | 752 printf("Original Pattern: "); |
| 753 UChar32 c = utext_next32From(This->fPattern, 0); | 753 UChar32 c = utext_next32From(fPattern, 0); |
| 754 while (c != U_SENTINEL) { | 754 while (c != U_SENTINEL) { |
| 755 if (c<32 || c>256) { | 755 if (c<32 || c>256) { |
| 756 c = '.'; | 756 c = '.'; |
| 757 } | 757 } |
| 758 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); | 758 printf("%c", c); |
| 759 | 759 |
| 760 c = UTEXT_NEXT32(This->fPattern); | 760 c = UTEXT_NEXT32(fPattern); |
| 761 } | 761 } |
| 762 REGEX_DUMP_DEBUG_PRINTF(("\n")); | 762 printf("\n"); |
| 763 REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen)); | 763 printf(" Min Match Length: %d\n", fMinMatchLen); |
| 764 REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType))); | 764 printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType)); |
| 765 if (This->fStartType == START_STRING) { | 765 if (fStartType == START_STRING) { |
| 766 REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \"")); | 766 printf(" Initial match string: \""); |
| 767 for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) { | 767 for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) { |
| 768 REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i])); // TODO: non-printables, surrogates. | 768 printf("%c", fLiteralText[i]); // TODO: non-printables, surrogates. |
| 769 } | 769 } |
| 770 REGEX_DUMP_DEBUG_PRINTF(("\"\n")); | 770 printf("\"\n"); |
| 771 | 771 |
| 772 } else if (This->fStartType == START_SET) { | 772 } else if (fStartType == START_SET) { |
| 773 int32_t numSetChars = This->fInitialChars->size(); | 773 int32_t numSetChars = fInitialChars->size(); |
| 774 if (numSetChars > 20) { | 774 if (numSetChars > 20) { |
| 775 numSetChars = 20; | 775 numSetChars = 20; |
| 776 } | 776 } |
| 777 REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : ")); | 777 printf(" Match First Chars : "); |
| 778 for (i=0; i<numSetChars; i++) { | 778 for (i=0; i<numSetChars; i++) { |
| 779 UChar32 c = This->fInitialChars->charAt(i); | 779 UChar32 c = fInitialChars->charAt(i); |
| 780 if (0x20<c && c <0x7e) { | 780 if (0x20<c && c <0x7e) { |
| 781 REGEX_DUMP_DEBUG_PRINTF(("%c ", c)); | 781 printf("%c ", c); |
| 782 } else { | 782 } else { |
| 783 REGEX_DUMP_DEBUG_PRINTF(("%#x ", c)); | 783 printf("%#x ", c); |
| 784 } | 784 } |
| 785 } | 785 } |
| 786 if (numSetChars < This->fInitialChars->size()) { | 786 if (numSetChars < fInitialChars->size()) { |
| 787 REGEX_DUMP_DEBUG_PRINTF((" ...")); | 787 printf(" ..."); |
| 788 } | 788 } |
| 789 REGEX_DUMP_DEBUG_PRINTF(("\n")); | 789 printf("\n"); |
| 790 | 790 |
| 791 } else if (This->fStartType == START_CHAR) { | 791 } else if (fStartType == START_CHAR) { |
| 792 REGEX_DUMP_DEBUG_PRINTF((" First char of Match : ")); | 792 printf(" First char of Match : "); |
| 793 if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) { | 793 if (0x20 < fInitialChar && fInitialChar<0x7e) { |
| 794 REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar)); | 794 printf("%c\n", fInitialChar); |
| 795 } else { | 795 } else { |
| 796 REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar)); | 796 printf("%#x\n", fInitialChar); |
| 797 } | 797 } |
| 798 } | 798 } |
| 799 | 799 |
| 800 REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \ | 800 printf("\nIndex Binary Type Operand\n" \ |
| 801 "-------------------------------------------\n")); | 801 "-------------------------------------------\n"); |
| 802 for (index = 0; index<This->fCompiledPat->size(); index++) { | 802 for (index = 0; index<fCompiledPat->size(); index++) { |
| 803 This->dumpOp(index); | 803 dumpOp(index); |
| 804 } | 804 } |
| 805 REGEX_DUMP_DEBUG_PRINTF(("\n\n")); | 805 printf("\n\n"); |
| 806 #endif |
| 806 } | 807 } |
| 807 #endif | |
| 808 | 808 |
| 809 | 809 |
| 810 | 810 |
| 811 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern) | 811 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern) |
| 812 | 812 |
| 813 U_NAMESPACE_END | 813 U_NAMESPACE_END |
| 814 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | 814 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
| OLD | NEW |