OLD | NEW |
1 // | 1 // |
2 // file: repattrn.cpp | 2 // file: repattrn.cpp |
3 // | 3 // |
4 /* | 4 /* |
5 *************************************************************************** | 5 *************************************************************************** |
6 * Copyright (C) 2002-2012 International Business Machines Corporation * | 6 * Copyright (C) 2002-2013 International Business Machines Corporation * |
7 * and others. All rights reserved. * | 7 * and others. All rights reserved. * |
8 *************************************************************************** | 8 *************************************************************************** |
9 */ | 9 */ |
10 | 10 |
11 #include "unicode/utypes.h" | 11 #include "unicode/utypes.h" |
12 | 12 |
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | 13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
14 | 14 |
15 #include "unicode/regex.h" | 15 #include "unicode/regex.h" |
16 #include "unicode/uclean.h" | 16 #include "unicode/uclean.h" |
(...skipping 251 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
268 //--------------------------------------------------------------------- | 268 //--------------------------------------------------------------------- |
269 RegexPattern * U_EXPORT2 | 269 RegexPattern * U_EXPORT2 |
270 RegexPattern::compile(const UnicodeString &regex, | 270 RegexPattern::compile(const UnicodeString &regex, |
271 uint32_t flags, | 271 uint32_t flags, |
272 UParseError &pe, | 272 UParseError &pe, |
273 UErrorCode &status) | 273 UErrorCode &status) |
274 { | 274 { |
275 if (U_FAILURE(status)) { | 275 if (U_FAILURE(status)) { |
276 return NULL; | 276 return NULL; |
277 } | 277 } |
278 | 278 |
279 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | | 279 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | |
280 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | | 280 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | |
281 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; | 281 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; |
282 | 282 |
283 if ((flags & ~allFlags) != 0) { | 283 if ((flags & ~allFlags) != 0) { |
284 status = U_REGEX_INVALID_FLAG; | 284 status = U_REGEX_INVALID_FLAG; |
285 return NULL; | 285 return NULL; |
286 } | 286 } |
287 | 287 |
288 if ((flags & UREGEX_CANON_EQ) != 0) { | 288 if ((flags & UREGEX_CANON_EQ) != 0) { |
289 status = U_REGEX_UNIMPLEMENTED; | 289 status = U_REGEX_UNIMPLEMENTED; |
290 return NULL; | 290 return NULL; |
291 } | 291 } |
292 | 292 |
293 RegexPattern *This = new RegexPattern; | 293 RegexPattern *This = new RegexPattern; |
294 if (This == NULL) { | 294 if (This == NULL) { |
295 status = U_MEMORY_ALLOCATION_ERROR; | 295 status = U_MEMORY_ALLOCATION_ERROR; |
296 return NULL; | 296 return NULL; |
297 } | 297 } |
298 if (U_FAILURE(This->fDeferredStatus)) { | 298 if (U_FAILURE(This->fDeferredStatus)) { |
299 status = This->fDeferredStatus; | 299 status = This->fDeferredStatus; |
300 delete This; | 300 delete This; |
301 return NULL; | 301 return NULL; |
302 } | 302 } |
303 This->fFlags = flags; | 303 This->fFlags = flags; |
304 | 304 |
305 RegexCompile compiler(This, status); | 305 RegexCompile compiler(This, status); |
306 compiler.compile(regex, pe, status); | 306 compiler.compile(regex, pe, status); |
307 | 307 |
308 if (U_FAILURE(status)) { | 308 if (U_FAILURE(status)) { |
309 delete This; | 309 delete This; |
310 This = NULL; | 310 This = NULL; |
311 } | 311 } |
312 | 312 |
313 return This; | 313 return This; |
314 } | 314 } |
315 | 315 |
316 | 316 |
317 // | 317 // |
318 // compile, UText mode | 318 // compile, UText mode |
319 // | 319 // |
320 RegexPattern * U_EXPORT2 | 320 RegexPattern * U_EXPORT2 |
321 RegexPattern::compile(UText *regex, | 321 RegexPattern::compile(UText *regex, |
322 uint32_t flags, | 322 uint32_t flags, |
(...skipping 25 matching lines...) Expand all Loading... |
348 } | 348 } |
349 if (U_FAILURE(This->fDeferredStatus)) { | 349 if (U_FAILURE(This->fDeferredStatus)) { |
350 status = This->fDeferredStatus; | 350 status = This->fDeferredStatus; |
351 delete This; | 351 delete This; |
352 return NULL; | 352 return NULL; |
353 } | 353 } |
354 This->fFlags = flags; | 354 This->fFlags = flags; |
355 | 355 |
356 RegexCompile compiler(This, status); | 356 RegexCompile compiler(This, status); |
357 compiler.compile(regex, pe, status); | 357 compiler.compile(regex, pe, status); |
358 | 358 |
359 if (U_FAILURE(status)) { | 359 if (U_FAILURE(status)) { |
360 delete This; | 360 delete This; |
361 This = NULL; | 361 This = NULL; |
362 } | 362 } |
363 | 363 |
364 return This; | 364 return This; |
365 } | 365 } |
366 | 366 |
367 // | 367 // |
368 // compile with default flags. | 368 // compile with default flags. |
(...skipping 162 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
531 UnicodeString RegexPattern::pattern() const { | 531 UnicodeString RegexPattern::pattern() const { |
532 if (fPatternString != NULL) { | 532 if (fPatternString != NULL) { |
533 return *fPatternString; | 533 return *fPatternString; |
534 } else if (fPattern == NULL) { | 534 } else if (fPattern == NULL) { |
535 return UnicodeString(); | 535 return UnicodeString(); |
536 } else { | 536 } else { |
537 UErrorCode status = U_ZERO_ERROR; | 537 UErrorCode status = U_ZERO_ERROR; |
538 int64_t nativeLen = utext_nativeLength(fPattern); | 538 int64_t nativeLen = utext_nativeLength(fPattern); |
539 int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error | 539 int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error |
540 UnicodeString result; | 540 UnicodeString result; |
541 | 541 |
542 status = U_ZERO_ERROR; | 542 status = U_ZERO_ERROR; |
543 UChar *resultChars = result.getBuffer(len16); | 543 UChar *resultChars = result.getBuffer(len16); |
544 utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning | 544 utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning |
545 result.releaseBuffer(len16); | 545 result.releaseBuffer(len16); |
546 | 546 |
547 return result; | 547 return result; |
548 } | 548 } |
549 } | 549 } |
550 | 550 |
551 | 551 |
552 | 552 |
553 | 553 |
554 //--------------------------------------------------------------------- | 554 //--------------------------------------------------------------------- |
555 // | 555 // |
556 // patternText | 556 // patternText |
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
615 } | 615 } |
616 | 616 |
617 | 617 |
618 | 618 |
619 //--------------------------------------------------------------------- | 619 //--------------------------------------------------------------------- |
620 // | 620 // |
621 // dump Output the compiled form of the pattern. | 621 // dump Output the compiled form of the pattern. |
622 // Debugging function only. | 622 // Debugging function only. |
623 // | 623 // |
624 //--------------------------------------------------------------------- | 624 //--------------------------------------------------------------------- |
| 625 void RegexPattern::dumpOp(int32_t index) const { |
| 626 (void)index; // Suppress warnings in non-debug build. |
625 #if defined(REGEX_DEBUG) | 627 #if defined(REGEX_DEBUG) |
626 void RegexPattern::dumpOp(int32_t index) const { | |
627 static const char * const opNames[] = {URX_OPCODE_NAMES}; | 628 static const char * const opNames[] = {URX_OPCODE_NAMES}; |
628 int32_t op = fCompiledPat->elementAti(index); | 629 int32_t op = fCompiledPat->elementAti(index); |
629 int32_t val = URX_VAL(op); | 630 int32_t val = URX_VAL(op); |
630 int32_t type = URX_TYPE(op); | 631 int32_t type = URX_TYPE(op); |
631 int32_t pinnedType = type; | 632 int32_t pinnedType = type; |
632 if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) { | 633 if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) { |
633 pinnedType = 0; | 634 pinnedType = 0; |
634 } | 635 } |
635 | 636 |
636 REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType])); | 637 printf("%4d %08x %-15s ", index, op, opNames[pinnedType]); |
637 switch (type) { | 638 switch (type) { |
638 case URX_NOP: | 639 case URX_NOP: |
639 case URX_DOTANY: | 640 case URX_DOTANY: |
640 case URX_DOTANY_ALL: | 641 case URX_DOTANY_ALL: |
641 case URX_FAIL: | 642 case URX_FAIL: |
642 case URX_CARET: | 643 case URX_CARET: |
643 case URX_DOLLAR: | 644 case URX_DOLLAR: |
644 case URX_BACKSLASH_G: | 645 case URX_BACKSLASH_G: |
645 case URX_BACKSLASH_X: | 646 case URX_BACKSLASH_X: |
646 case URX_END: | 647 case URX_END: |
(...skipping 28 matching lines...) Expand all Loading... |
675 case URX_LA_END: | 676 case URX_LA_END: |
676 case URX_BACKREF_I: | 677 case URX_BACKREF_I: |
677 case URX_LB_START: | 678 case URX_LB_START: |
678 case URX_LB_CONT: | 679 case URX_LB_CONT: |
679 case URX_LB_END: | 680 case URX_LB_END: |
680 case URX_LBN_CONT: | 681 case URX_LBN_CONT: |
681 case URX_LBN_END: | 682 case URX_LBN_END: |
682 case URX_LOOP_C: | 683 case URX_LOOP_C: |
683 case URX_LOOP_DOT_I: | 684 case URX_LOOP_DOT_I: |
684 // types with an integer operand field. | 685 // types with an integer operand field. |
685 REGEX_DUMP_DEBUG_PRINTF(("%d", val)); | 686 printf("%d", val); |
686 break; | 687 break; |
687 | 688 |
688 case URX_ONECHAR: | 689 case URX_ONECHAR: |
689 case URX_ONECHAR_I: | 690 case URX_ONECHAR_I: |
690 REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?')); | 691 printf("%c", val<256?val:'?'); |
691 break; | 692 break; |
692 | 693 |
693 case URX_STRING: | 694 case URX_STRING: |
694 case URX_STRING_I: | 695 case URX_STRING_I: |
695 { | 696 { |
696 int32_t lengthOp = fCompiledPat->elementAti(index+1); | 697 int32_t lengthOp = fCompiledPat->elementAti(index+1); |
697 U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); | 698 U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); |
698 int32_t length = URX_VAL(lengthOp); | 699 int32_t length = URX_VAL(lengthOp); |
699 int32_t i; | 700 int32_t i; |
700 for (i=val; i<val+length; i++) { | 701 for (i=val; i<val+length; i++) { |
701 UChar c = fLiteralText[i]; | 702 UChar c = fLiteralText[i]; |
702 if (c < 32 || c >= 256) {c = '.';} | 703 if (c < 32 || c >= 256) {c = '.';} |
703 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); | 704 printf("%c", c); |
704 } | 705 } |
705 } | 706 } |
706 break; | 707 break; |
707 | 708 |
708 case URX_SETREF: | 709 case URX_SETREF: |
709 case URX_LOOP_SR_I: | 710 case URX_LOOP_SR_I: |
710 { | 711 { |
711 UnicodeString s; | 712 UnicodeString s; |
712 UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); | 713 UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); |
713 set->toPattern(s, TRUE); | 714 set->toPattern(s, TRUE); |
714 for (int32_t i=0; i<s.length(); i++) { | 715 for (int32_t i=0; i<s.length(); i++) { |
715 REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i))); | 716 printf("%c", s.charAt(i)); |
716 } | 717 } |
717 } | 718 } |
718 break; | 719 break; |
719 | 720 |
720 case URX_STATIC_SETREF: | 721 case URX_STATIC_SETREF: |
721 case URX_STAT_SETREF_N: | 722 case URX_STAT_SETREF_N: |
722 { | 723 { |
723 UnicodeString s; | 724 UnicodeString s; |
724 if (val & URX_NEG_SET) { | 725 if (val & URX_NEG_SET) { |
725 REGEX_DUMP_DEBUG_PRINTF(("NOT ")); | 726 printf("NOT "); |
726 val &= ~URX_NEG_SET; | 727 val &= ~URX_NEG_SET; |
727 } | 728 } |
728 UnicodeSet *set = fStaticSets[val]; | 729 UnicodeSet *set = fStaticSets[val]; |
729 set->toPattern(s, TRUE); | 730 set->toPattern(s, TRUE); |
730 for (int32_t i=0; i<s.length(); i++) { | 731 for (int32_t i=0; i<s.length(); i++) { |
731 REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i))); | 732 printf("%c", s.charAt(i)); |
732 } | 733 } |
733 } | 734 } |
734 break; | 735 break; |
735 | 736 |
736 | 737 |
737 default: | 738 default: |
738 REGEX_DUMP_DEBUG_PRINTF(("??????")); | 739 printf("??????"); |
739 break; | 740 break; |
740 } | 741 } |
741 REGEX_DUMP_DEBUG_PRINTF(("\n")); | 742 printf("\n"); |
| 743 #endif |
742 } | 744 } |
743 #endif | |
744 | 745 |
745 | 746 |
| 747 void RegexPattern::dumpPattern() const { |
746 #if defined(REGEX_DEBUG) | 748 #if defined(REGEX_DEBUG) |
747 U_CAPI void U_EXPORT2 | |
748 RegexPatternDump(const RegexPattern *This) { | |
749 int index; | 749 int index; |
750 int i; | 750 int i; |
751 | 751 |
752 REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: ")); | 752 printf("Original Pattern: "); |
753 UChar32 c = utext_next32From(This->fPattern, 0); | 753 UChar32 c = utext_next32From(fPattern, 0); |
754 while (c != U_SENTINEL) { | 754 while (c != U_SENTINEL) { |
755 if (c<32 || c>256) { | 755 if (c<32 || c>256) { |
756 c = '.'; | 756 c = '.'; |
757 } | 757 } |
758 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); | 758 printf("%c", c); |
759 | 759 |
760 c = UTEXT_NEXT32(This->fPattern); | 760 c = UTEXT_NEXT32(fPattern); |
761 } | 761 } |
762 REGEX_DUMP_DEBUG_PRINTF(("\n")); | 762 printf("\n"); |
763 REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen)); | 763 printf(" Min Match Length: %d\n", fMinMatchLen); |
764 REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType))); | 764 printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType)); |
765 if (This->fStartType == START_STRING) { | 765 if (fStartType == START_STRING) { |
766 REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \"")); | 766 printf(" Initial match string: \""); |
767 for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) { | 767 for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) { |
768 REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i])); // TODO: non-printables, surrogates. | 768 printf("%c", fLiteralText[i]); // TODO: non-printables, surrogates. |
769 } | 769 } |
770 REGEX_DUMP_DEBUG_PRINTF(("\"\n")); | 770 printf("\"\n"); |
771 | 771 |
772 } else if (This->fStartType == START_SET) { | 772 } else if (fStartType == START_SET) { |
773 int32_t numSetChars = This->fInitialChars->size(); | 773 int32_t numSetChars = fInitialChars->size(); |
774 if (numSetChars > 20) { | 774 if (numSetChars > 20) { |
775 numSetChars = 20; | 775 numSetChars = 20; |
776 } | 776 } |
777 REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : ")); | 777 printf(" Match First Chars : "); |
778 for (i=0; i<numSetChars; i++) { | 778 for (i=0; i<numSetChars; i++) { |
779 UChar32 c = This->fInitialChars->charAt(i); | 779 UChar32 c = fInitialChars->charAt(i); |
780 if (0x20<c && c <0x7e) { | 780 if (0x20<c && c <0x7e) { |
781 REGEX_DUMP_DEBUG_PRINTF(("%c ", c)); | 781 printf("%c ", c); |
782 } else { | 782 } else { |
783 REGEX_DUMP_DEBUG_PRINTF(("%#x ", c)); | 783 printf("%#x ", c); |
784 } | 784 } |
785 } | 785 } |
786 if (numSetChars < This->fInitialChars->size()) { | 786 if (numSetChars < fInitialChars->size()) { |
787 REGEX_DUMP_DEBUG_PRINTF((" ...")); | 787 printf(" ..."); |
788 } | 788 } |
789 REGEX_DUMP_DEBUG_PRINTF(("\n")); | 789 printf("\n"); |
790 | 790 |
791 } else if (This->fStartType == START_CHAR) { | 791 } else if (fStartType == START_CHAR) { |
792 REGEX_DUMP_DEBUG_PRINTF((" First char of Match : ")); | 792 printf(" First char of Match : "); |
793 if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) { | 793 if (0x20 < fInitialChar && fInitialChar<0x7e) { |
794 REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar)); | 794 printf("%c\n", fInitialChar); |
795 } else { | 795 } else { |
796 REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar)); | 796 printf("%#x\n", fInitialChar); |
797 } | 797 } |
798 } | 798 } |
799 | 799 |
800 REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \ | 800 printf("\nIndex Binary Type Operand\n" \ |
801 "-------------------------------------------\n")); | 801 "-------------------------------------------\n"); |
802 for (index = 0; index<This->fCompiledPat->size(); index++) { | 802 for (index = 0; index<fCompiledPat->size(); index++) { |
803 This->dumpOp(index); | 803 dumpOp(index); |
804 } | 804 } |
805 REGEX_DUMP_DEBUG_PRINTF(("\n\n")); | 805 printf("\n\n"); |
| 806 #endif |
806 } | 807 } |
807 #endif | |
808 | 808 |
809 | 809 |
810 | 810 |
811 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern) | 811 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern) |
812 | 812 |
813 U_NAMESPACE_END | 813 U_NAMESPACE_END |
814 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | 814 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
OLD | NEW |