OLD | NEW |
1 /* | 1 /* |
2 ********************************************************************** | 2 ********************************************************************** |
3 * Copyright (C) 2002-2014, International Business Machines | 3 * Copyright (C) 2002-2015, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ********************************************************************** | 5 ********************************************************************** |
6 * file name: regex.h | 6 * file name: regex.h |
7 * encoding: US-ASCII | 7 * encoding: US-ASCII |
8 * indentation:4 | 8 * indentation:4 |
9 * | 9 * |
10 * created on: 2002oct22 | 10 * created on: 2002oct22 |
11 * created by: Andy Heninger | 11 * created by: Andy Heninger |
12 * | 12 * |
13 * ICU Regular Expressions, API for C++ | 13 * ICU Regular Expressions, API for C++ |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
48 | 48 |
49 #include "unicode/uobject.h" | 49 #include "unicode/uobject.h" |
50 #include "unicode/unistr.h" | 50 #include "unicode/unistr.h" |
51 #include "unicode/utext.h" | 51 #include "unicode/utext.h" |
52 #include "unicode/parseerr.h" | 52 #include "unicode/parseerr.h" |
53 | 53 |
54 #include "unicode/uregex.h" | 54 #include "unicode/uregex.h" |
55 | 55 |
56 // Forward Declarations | 56 // Forward Declarations |
57 | 57 |
| 58 struct UHashtable; |
| 59 |
58 U_NAMESPACE_BEGIN | 60 U_NAMESPACE_BEGIN |
59 | 61 |
60 struct Regex8BitSet; | 62 struct Regex8BitSet; |
61 class RegexCImpl; | 63 class RegexCImpl; |
62 class RegexMatcher; | 64 class RegexMatcher; |
63 class RegexPattern; | 65 class RegexPattern; |
64 struct REStackFrame; | 66 struct REStackFrame; |
65 class RuleBasedBreakIterator; | 67 class RuleBasedBreakIterator; |
66 class UnicodeSet; | 68 class UnicodeSet; |
67 class UVector; | 69 class UVector; |
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
129 | 131 |
130 /** | 132 /** |
131 * Assignment operator. After assignment, this RegexPattern will behave ide
ntically | 133 * Assignment operator. After assignment, this RegexPattern will behave ide
ntically |
132 * to the source object. | 134 * to the source object. |
133 * @stable ICU 2.4 | 135 * @stable ICU 2.4 |
134 */ | 136 */ |
135 RegexPattern &operator =(const RegexPattern &source); | 137 RegexPattern &operator =(const RegexPattern &source); |
136 | 138 |
137 /** | 139 /** |
138 * Create an exact copy of this RegexPattern object. Since RegexPattern is
not | 140 * Create an exact copy of this RegexPattern object. Since RegexPattern is
not |
139 * intended to be subclasses, <code>clone()</code> and the copy construction
are | 141 * intended to be subclassed, <code>clone()</code> and the copy construction
are |
140 * equivalent operations. | 142 * equivalent operations. |
141 * @return the copy of this RegexPattern | 143 * @return the copy of this RegexPattern |
142 * @stable ICU 2.4 | 144 * @stable ICU 2.4 |
143 */ | 145 */ |
144 virtual RegexPattern *clone() const; | 146 virtual RegexPattern *clone() const; |
145 | 147 |
146 | 148 |
147 /** | 149 /** |
148 * Compiles the regular expression in string form into a RegexPattern | 150 * Compiles the regular expression in string form into a RegexPattern |
149 * object. These compile methods, rather than the constructors, are the usua
l | 151 * object. These compile methods, rather than the constructors, are the usua
l |
(...skipping 281 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
431 * Note: This is the original input, not a clone. If the pattern was original
ly compiled from a | 433 * Note: This is the original input, not a clone. If the pattern was original
ly compiled from a |
432 * UText, and that UText was modified, the returned UText may no longer refle
ct the RegexPattern | 434 * UText, and that UText was modified, the returned UText may no longer refle
ct the RegexPattern |
433 * object. | 435 * object. |
434 * | 436 * |
435 * @stable ICU 4.6 | 437 * @stable ICU 4.6 |
436 */ | 438 */ |
437 virtual UText *patternText(UErrorCode &status) const; | 439 virtual UText *patternText(UErrorCode &status) const; |
438 | 440 |
439 | 441 |
440 /** | 442 /** |
| 443 * Get the group number corresponding to a named capture group. |
| 444 * The returned number can be used with any function that accesses |
| 445 * capture groups by number. |
| 446 * |
| 447 * The function returns an error status if the specified name does not |
| 448 * appear in the pattern. |
| 449 * |
| 450 * @param groupName The capture group name. |
| 451 * @param status A UErrorCode to receive any errors. |
| 452 * |
| 453 * @draft ICU 55 |
| 454 */ |
| 455 virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCo
de &status) const; |
| 456 |
| 457 |
| 458 /** |
| 459 * Get the group number corresponding to a named capture group. |
| 460 * The returned number can be used with any function that accesses |
| 461 * capture groups by number. |
| 462 * |
| 463 * The function returns an error status if the specified name does not |
| 464 * appear in the pattern. |
| 465 * |
| 466 * @param groupName The capture group name, |
| 467 * platform invariant characters only. |
| 468 * @param nameLength The length of the name, or -1 if the name is |
| 469 * nul-terminated. |
| 470 * @param status A UErrorCode to receive any errors. |
| 471 * |
| 472 * @draft ICU 55 |
| 473 */ |
| 474 virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLengt
h, UErrorCode &status) const; |
| 475 |
| 476 |
| 477 /** |
441 * Split a string into fields. Somewhat like split() from Perl or Java. | 478 * Split a string into fields. Somewhat like split() from Perl or Java. |
442 * Pattern matches identify delimiters that separate the input | 479 * Pattern matches identify delimiters that separate the input |
443 * into fields. The input data between the delimiters becomes the | 480 * into fields. The input data between the delimiters becomes the |
444 * fields themselves. | 481 * fields themselves. |
445 * | 482 * |
446 * If the delimiter pattern includes capture groups, the captured text will | 483 * If the delimiter pattern includes capture groups, the captured text will |
447 * also appear in the destination array of output strings, interspersed | 484 * also appear in the destination array of output strings, interspersed |
448 * with the fields. This is similar to Perl, but differs from Java, | 485 * with the fields. This is similar to Perl, but differs from Java, |
449 * which ignores the presence of capture groups in the pattern. | 486 * which ignores the presence of capture groups in the pattern. |
450 * | 487 * |
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
566 int32_t fFrameSize; // Size of a state stack frame in the | 603 int32_t fFrameSize; // Size of a state stack frame in the |
567 // execution engine. | 604 // execution engine. |
568 | 605 |
569 int32_t fDataSize; // The size of the data needed by the pattern
that | 606 int32_t fDataSize; // The size of the data needed by the pattern
that |
570 // does not go on the state stack, but has
just | 607 // does not go on the state stack, but has
just |
571 // a single copy per matcher. | 608 // a single copy per matcher. |
572 | 609 |
573 UVector32 *fGroupMap; // Map from capture group number to position
of | 610 UVector32 *fGroupMap; // Map from capture group number to position
of |
574 // the group's variables in the matcher sta
ck frame. | 611 // the group's variables in the matcher sta
ck frame. |
575 | 612 |
576 int32_t fMaxCaptureDigits; | |
577 | |
578 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined | 613 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined |
579 // regex character classes, e.g. Word. | 614 // regex character classes, e.g. Word. |
580 | 615 |
581 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only | 616 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only |
582 // sets for predefined regex classes. | 617 // sets for predefined regex classes. |
583 | 618 |
584 int32_t fStartType; // Info on how a match must start. | 619 int32_t fStartType; // Info on how a match must start. |
585 int32_t fInitialStringIdx; // | 620 int32_t fInitialStringIdx; // |
586 int32_t fInitialStringLen; | 621 int32_t fInitialStringLen; |
587 UnicodeSet *fInitialChars; | 622 UnicodeSet *fInitialChars; |
588 UChar32 fInitialChar; | 623 UChar32 fInitialChar; |
589 Regex8BitSet *fInitialChars8; | 624 Regex8BitSet *fInitialChars8; |
590 UBool fNeedsAltInput; | 625 UBool fNeedsAltInput; |
591 | 626 |
| 627 UHashtable *fNamedCaptureMap; // Map from capture group names to number
s. |
| 628 |
592 friend class RegexCompile; | 629 friend class RegexCompile; |
593 friend class RegexMatcher; | 630 friend class RegexMatcher; |
594 friend class RegexCImpl; | 631 friend class RegexCImpl; |
595 | 632 |
596 // | 633 // |
597 // Implementation Methods | 634 // Implementation Methods |
598 // | 635 // |
599 void init(); // Common initialization, for use by construc
tors. | 636 void init(); // Common initialization, for use by construc
tors. |
600 void zap(); // Common cleanup | 637 void zap(); // Common cleanup |
601 | 638 |
(...skipping 203 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
805 * Find the next pattern match in the input string. | 842 * Find the next pattern match in the input string. |
806 * The find begins searching the input at the location following the end of | 843 * The find begins searching the input at the location following the end of |
807 * the previous match, or at the start of the string if there is no previous
match. | 844 * the previous match, or at the start of the string if there is no previous
match. |
808 * If a match is found, <code>start(), end()</code> and <code>group()</code> | 845 * If a match is found, <code>start(), end()</code> and <code>group()</code> |
809 * will provide more information regarding the match. | 846 * will provide more information regarding the match. |
810 * <p>Note that if the input string is changed by the application, | 847 * <p>Note that if the input string is changed by the application, |
811 * use find(startPos, status) instead of find(), because the saved starti
ng | 848 * use find(startPos, status) instead of find(), because the saved starti
ng |
812 * position may not be valid with the altered input string.</p> | 849 * position may not be valid with the altered input string.</p> |
813 * @param status A reference to a UErrorCode to receive any errors. | 850 * @param status A reference to a UErrorCode to receive any errors. |
814 * @return TRUE if a match is found. | 851 * @return TRUE if a match is found. |
815 * @internal | 852 * @draft ICU 55 |
816 */ | 853 */ |
817 virtual UBool find(UErrorCode &status); | 854 virtual UBool find(UErrorCode &status); |
818 | 855 |
819 /** | 856 /** |
820 * Resets this RegexMatcher and then attempts to find the next substring of
the | 857 * Resets this RegexMatcher and then attempts to find the next substring of
the |
821 * input string that matches the pattern, starting at the specified index. | 858 * input string that matches the pattern, starting at the specified index. |
822 * | 859 * |
823 * @param start The (native) index in the input string to begin the s
earch. | 860 * @param start The (native) index in the input string to begin the s
earch. |
824 * @param status A reference to a UErrorCode to receive any errors. | 861 * @param status A reference to a UErrorCode to receive any errors. |
825 * @return TRUE if a match is found. | 862 * @return TRUE if a match is found. |
(...skipping 11 matching lines...) Expand all Loading... |
837 * @return a string containing the matched input text. | 874 * @return a string containing the matched input text. |
838 * @stable ICU 2.4 | 875 * @stable ICU 2.4 |
839 */ | 876 */ |
840 virtual UnicodeString group(UErrorCode &status) const; | 877 virtual UnicodeString group(UErrorCode &status) const; |
841 | 878 |
842 | 879 |
843 /** | 880 /** |
844 * Returns a string containing the text captured by the given group | 881 * Returns a string containing the text captured by the given group |
845 * during the previous match operation. Group(0) is the entire match. | 882 * during the previous match operation. Group(0) is the entire match. |
846 * | 883 * |
| 884 * A zero length string is returned both for capture groups that did not |
| 885 * participate in the match and for actual zero length matches. |
| 886 * To distinguish between these two cases use the function start(), |
| 887 * which returns -1 for non-participating groups. |
| 888 * |
847 * @param groupNum the capture group number | 889 * @param groupNum the capture group number |
848 * @param status A reference to a UErrorCode to receive any errors. | 890 * @param status A reference to a UErrorCode to receive any errors. |
849 * Possible errors are U_REGEX_INVALID_STATE if no ma
tch | 891 * Possible errors are U_REGEX_INVALID_STATE if no ma
tch |
850 * has been attempted or the last match failed and | 892 * has been attempted or the last match failed and |
851 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group n
umber. | 893 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group n
umber. |
852 * @return the captured text | 894 * @return the captured text |
853 * @stable ICU 2.4 | 895 * @stable ICU 2.4 |
854 */ | 896 */ |
855 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; | 897 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; |
856 | 898 |
857 | |
858 /** | 899 /** |
859 * Returns the number of capturing groups in this matcher's pattern. | 900 * Returns the number of capturing groups in this matcher's pattern. |
860 * @return the number of capture groups | 901 * @return the number of capture groups |
861 * @stable ICU 2.4 | 902 * @stable ICU 2.4 |
862 */ | 903 */ |
863 virtual int32_t groupCount() const; | 904 virtual int32_t groupCount() const; |
864 | 905 |
865 | 906 |
866 /** | 907 /** |
867 * Returns a shallow clone of the entire live input string with the UText c
urrent native index | 908 * Returns a shallow clone of the entire live input string with the UText c
urrent native index |
868 * set to the beginning of the requested group. | 909 * set to the beginning of the requested group. |
869 * | 910 * |
870 * @param dest The UText into which the input should be cloned, or
NULL to create a new UText | 911 * @param dest The UText into which the input should be cloned, or
NULL to create a new UText |
871 * @param group_len A reference to receive the length of the desired ca
pture group | 912 * @param group_len A reference to receive the length of the desired ca
pture group |
872 * @param status A reference to a UErrorCode to receive any errors. | 913 * @param status A reference to a UErrorCode to receive any errors. |
873 * Possible errors are U_REGEX_INVALID_STATE if no ma
tch | 914 * Possible errors are U_REGEX_INVALID_STATE if no ma
tch |
874 * has been attempted or the last match failed and | 915 * has been attempted or the last match failed and |
875 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group n
umber. | 916 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group n
umber. |
876 * @return dest if non-NULL, a shallow copy of the input text otherwise | 917 * @return dest if non-NULL, a shallow copy of the input text otherwise |
877 * | 918 * |
878 * @stable ICU 4.6 | 919 * @stable ICU 4.6 |
879 */ | 920 */ |
880 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) co
nst; | 921 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) co
nst; |
881 | 922 |
882 /** | 923 /** |
883 * Returns a shallow clone of the entire live input string with the UText c
urrent native index | 924 * Returns a shallow clone of the entire live input string with the UText c
urrent native index |
884 * set to the beginning of the requested group. | 925 * set to the beginning of the requested group. |
885 * | 926 * |
| 927 * A group length of zero is returned both for capture groups that did not |
| 928 * participate in the match and for actual zero length matches. |
| 929 * To distinguish between these two cases use the function start(), |
| 930 * which returns -1 for non-participating groups. |
| 931 * |
886 * @param groupNum The capture group number. | 932 * @param groupNum The capture group number. |
887 * @param dest The UText into which the input should be cloned, or
NULL to create a new UText. | 933 * @param dest The UText into which the input should be cloned, or
NULL to create a new UText. |
888 * @param group_len A reference to receive the length of the desired ca
pture group | 934 * @param group_len A reference to receive the length of the desired ca
pture group |
889 * @param status A reference to a UErrorCode to receive any errors. | 935 * @param status A reference to a UErrorCode to receive any errors. |
890 * Possible errors are U_REGEX_INVALID_STATE if no ma
tch | 936 * Possible errors are U_REGEX_INVALID_STATE if no ma
tch |
891 * has been attempted or the last match failed and | 937 * has been attempted or the last match failed and |
892 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group n
umber. | 938 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group n
umber. |
893 * @return dest if non-NULL, a shallow copy of the input text otherwise | 939 * @return dest if non-NULL, a shallow copy of the input text otherwise |
894 * | 940 * |
895 * @stable ICU 4.6 | 941 * @stable ICU 4.6 |
896 */ | 942 */ |
897 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErr
orCode &status) const; | 943 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErr
orCode &status) const; |
898 | 944 |
899 /** | 945 /** |
900 * Returns a string containing the text captured by the given group | |
901 * during the previous match operation. Group(0) is the entire match. | |
902 * | |
903 * @param groupNum the capture group number | |
904 * @param dest A mutable UText in which the matching text is place
d. | |
905 * If NULL, a new UText will be created (which may not
be mutable). | |
906 * @param status A reference to a UErrorCode to receive any errors. | |
907 * Possible errors are U_REGEX_INVALID_STATE if no ma
tch | |
908 * has been attempted or the last match failed. | |
909 * @return A string containing the matched input text. If a pre-allocated
UText | |
910 * was provided, it will always be used and returned. | |
911 * | |
912 * @internal ICU 4.4 technology preview | |
913 */ | |
914 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) cons
t; | |
915 | |
916 | |
917 /** | |
918 * Returns the index in the input string of the start of the text matched | 946 * Returns the index in the input string of the start of the text matched |
919 * during the previous match operation. | 947 * during the previous match operation. |
920 * @param status a reference to a UErrorCode to receive any errors. | 948 * @param status a reference to a UErrorCode to receive any errors. |
921 * @return The (native) position in the input string of the s
tart of the last match. | 949 * @return The (native) position in the input string of the s
tart of the last match. |
922 * @stable ICU 2.4 | 950 * @stable ICU 2.4 |
923 */ | 951 */ |
924 virtual int32_t start(UErrorCode &status) const; | 952 virtual int32_t start(UErrorCode &status) const; |
925 | 953 |
926 /** | 954 /** |
927 * Returns the index in the input string of the start of the text matched | 955 * Returns the index in the input string of the start of the text matched |
(...skipping 28 matching lines...) Expand all Loading... |
956 * @param group the capture group number. | 984 * @param group the capture group number. |
957 * @param status A reference to a UErrorCode to receive any errors.
Possible | 985 * @param status A reference to a UErrorCode to receive any errors.
Possible |
958 * errors are U_REGEX_INVALID_STATE if no match has b
een | 986 * errors are U_REGEX_INVALID_STATE if no match has b
een |
959 * attempted or the last match failed, and | 987 * attempted or the last match failed, and |
960 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group n
umber. | 988 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group n
umber. |
961 * @return the (native) start position of substring matched by the specifi
ed group. | 989 * @return the (native) start position of substring matched by the specifi
ed group. |
962 * @stable ICU 4.6 | 990 * @stable ICU 4.6 |
963 */ | 991 */ |
964 virtual int64_t start64(int32_t group, UErrorCode &status) const; | 992 virtual int64_t start64(int32_t group, UErrorCode &status) const; |
965 | 993 |
966 | |
967 /** | 994 /** |
968 * Returns the index in the input string of the first character following
the | 995 * Returns the index in the input string of the first character following
the |
969 * text matched during the previous match operation. | 996 * text matched during the previous match operation. |
970 * | 997 * |
971 * @param status A reference to a UErrorCode to receive any errors.
Possible | 998 * @param status A reference to a UErrorCode to receive any errors.
Possible |
972 * errors are U_REGEX_INVALID_STATE if no match has b
een | 999 * errors are U_REGEX_INVALID_STATE if no match has b
een |
973 * attempted or the last match failed. | 1000 * attempted or the last match failed. |
974 * @return the index of the last character matched, plus one. | 1001 * @return the index of the last character matched, plus one. |
975 * The index value returned is a native index, corresp
onding to | 1002 * The index value returned is a native index, corresp
onding to |
976 * code units for the underlying encoding type, for ex
ample, | 1003 * code units for the underlying encoding type, for ex
ample, |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1026 * @return the index of the first character following the text | 1053 * @return the index of the first character following the text |
1027 * captured by the specified group during the previous match ope
ration. | 1054 * captured by the specified group during the previous match ope
ration. |
1028 * Return -1 if the capture group exists in the pattern but was
not part of the match. | 1055 * Return -1 if the capture group exists in the pattern but was
not part of the match. |
1029 * The index value returned is a native index, corresponding to | 1056 * The index value returned is a native index, corresponding to |
1030 * code units for the underlying encoding type, for example, | 1057 * code units for the underlying encoding type, for example, |
1031 * a byte index for UTF8. | 1058 * a byte index for UTF8. |
1032 * @stable ICU 4.6 | 1059 * @stable ICU 4.6 |
1033 */ | 1060 */ |
1034 virtual int64_t end64(int32_t group, UErrorCode &status) const; | 1061 virtual int64_t end64(int32_t group, UErrorCode &status) const; |
1035 | 1062 |
1036 | |
1037 /** | 1063 /** |
1038 * Resets this matcher. The effect is to remove any memory of previous mat
ches, | 1064 * Resets this matcher. The effect is to remove any memory of previous mat
ches, |
1039 * and to cause subsequent find() operations to begin at the beginning
of | 1065 * and to cause subsequent find() operations to begin at the beginning
of |
1040 * the input string. | 1066 * the input string. |
1041 * | 1067 * |
1042 * @return this RegexMatcher. | 1068 * @return this RegexMatcher. |
1043 * @stable ICU 2.4 | 1069 * @stable ICU 2.4 |
1044 */ | 1070 */ |
1045 virtual RegexMatcher &reset(); | 1071 virtual RegexMatcher &reset(); |
1046 | 1072 |
(...skipping 801 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1848 | 1874 |
1849 UErrorCode fDeferredStatus; // Save error state that cannot be im
mediately | 1875 UErrorCode fDeferredStatus; // Save error state that cannot be im
mediately |
1850 // reported, or that permanently di
sables this matcher. | 1876 // reported, or that permanently di
sables this matcher. |
1851 | 1877 |
1852 RuleBasedBreakIterator *fWordBreakItr; | 1878 RuleBasedBreakIterator *fWordBreakItr; |
1853 }; | 1879 }; |
1854 | 1880 |
1855 U_NAMESPACE_END | 1881 U_NAMESPACE_END |
1856 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS | 1882 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS |
1857 #endif | 1883 #endif |
OLD | NEW |