Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(472)

Side by Side Diff: source/i18n/unicode/regex.h

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/unicode/rbnf.h ('k') | source/i18n/unicode/region.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 ********************************************************************** 2 **********************************************************************
3 * Copyright (C) 2002-2014, International Business Machines 3 * Copyright (C) 2002-2015, International Business Machines
4 * Corporation and others. All Rights Reserved. 4 * Corporation and others. All Rights Reserved.
5 ********************************************************************** 5 **********************************************************************
6 * file name: regex.h 6 * file name: regex.h
7 * encoding: US-ASCII 7 * encoding: US-ASCII
8 * indentation:4 8 * indentation:4
9 * 9 *
10 * created on: 2002oct22 10 * created on: 2002oct22
11 * created by: Andy Heninger 11 * created by: Andy Heninger
12 * 12 *
13 * ICU Regular Expressions, API for C++ 13 * ICU Regular Expressions, API for C++
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
48 48
49 #include "unicode/uobject.h" 49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h" 50 #include "unicode/unistr.h"
51 #include "unicode/utext.h" 51 #include "unicode/utext.h"
52 #include "unicode/parseerr.h" 52 #include "unicode/parseerr.h"
53 53
54 #include "unicode/uregex.h" 54 #include "unicode/uregex.h"
55 55
56 // Forward Declarations 56 // Forward Declarations
57 57
58 struct UHashtable;
59
58 U_NAMESPACE_BEGIN 60 U_NAMESPACE_BEGIN
59 61
60 struct Regex8BitSet; 62 struct Regex8BitSet;
61 class RegexCImpl; 63 class RegexCImpl;
62 class RegexMatcher; 64 class RegexMatcher;
63 class RegexPattern; 65 class RegexPattern;
64 struct REStackFrame; 66 struct REStackFrame;
65 class RuleBasedBreakIterator; 67 class RuleBasedBreakIterator;
66 class UnicodeSet; 68 class UnicodeSet;
67 class UVector; 69 class UVector;
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after
129 131
130 /** 132 /**
131 * Assignment operator. After assignment, this RegexPattern will behave identically 133 * Assignment operator. After assignment, this RegexPattern will behave identically
132 * to the source object. 134 * to the source object.
133 * @stable ICU 2.4 135 * @stable ICU 2.4
134 */ 136 */
135 RegexPattern &operator =(const RegexPattern &source); 137 RegexPattern &operator =(const RegexPattern &source);
136 138
137 /** 139 /**
138 * Create an exact copy of this RegexPattern object. Since RegexPattern is not 140 * Create an exact copy of this RegexPattern object. Since RegexPattern is not
139 * intended to be subclasses, <code>clone()</code> and the copy construction are 141 * intended to be subclassed, <code>clone()</code> and the copy construction are
140 * equivalent operations. 142 * equivalent operations.
141 * @return the copy of this RegexPattern 143 * @return the copy of this RegexPattern
142 * @stable ICU 2.4 144 * @stable ICU 2.4
143 */ 145 */
144 virtual RegexPattern *clone() const; 146 virtual RegexPattern *clone() const;
145 147
146 148
147 /** 149 /**
148 * Compiles the regular expression in string form into a RegexPattern 150 * Compiles the regular expression in string form into a RegexPattern
149 * object. These compile methods, rather than the constructors, are the usual 151 * object. These compile methods, rather than the constructors, are the usual
(...skipping 281 matching lines...) Expand 10 before | Expand all | Expand 10 after
431 * Note: This is the original input, not a clone. If the pattern was originally compiled from a 433 * Note: This is the original input, not a clone. If the pattern was originally compiled from a
432 * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern 434 * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
433 * object. 435 * object.
434 * 436 *
435 * @stable ICU 4.6 437 * @stable ICU 4.6
436 */ 438 */
437 virtual UText *patternText(UErrorCode &status) const; 439 virtual UText *patternText(UErrorCode &status) const;
438 440
439 441
440 /** 442 /**
443 * Get the group number corresponding to a named capture group.
444 * The returned number can be used with any function that accesses
445 * capture groups by number.
446 *
447 * The function returns an error status if the specified name does not
448 * appear in the pattern.
449 *
450 * @param groupName The capture group name.
451 * @param status A UErrorCode to receive any errors.
452 *
453 * @draft ICU 55
454 */
455 virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCo de &status) const;
456
457
458 /**
459 * Get the group number corresponding to a named capture group.
460 * The returned number can be used with any function that accesses
461 * capture groups by number.
462 *
463 * The function returns an error status if the specified name does not
464 * appear in the pattern.
465 *
466 * @param groupName The capture group name,
467 * platform invariant characters only.
468 * @param nameLength The length of the name, or -1 if the name is
469 * nul-terminated.
470 * @param status A UErrorCode to receive any errors.
471 *
472 * @draft ICU 55
473 */
474 virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLengt h, UErrorCode &status) const;
475
476
477 /**
441 * Split a string into fields. Somewhat like split() from Perl or Java. 478 * Split a string into fields. Somewhat like split() from Perl or Java.
442 * Pattern matches identify delimiters that separate the input 479 * Pattern matches identify delimiters that separate the input
443 * into fields. The input data between the delimiters becomes the 480 * into fields. The input data between the delimiters becomes the
444 * fields themselves. 481 * fields themselves.
445 * 482 *
446 * If the delimiter pattern includes capture groups, the captured text will 483 * If the delimiter pattern includes capture groups, the captured text will
447 * also appear in the destination array of output strings, interspersed 484 * also appear in the destination array of output strings, interspersed
448 * with the fields. This is similar to Perl, but differs from Java, 485 * with the fields. This is similar to Perl, but differs from Java,
449 * which ignores the presence of capture groups in the pattern. 486 * which ignores the presence of capture groups in the pattern.
450 * 487 *
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after
566 int32_t fFrameSize; // Size of a state stack frame in the 603 int32_t fFrameSize; // Size of a state stack frame in the
567 // execution engine. 604 // execution engine.
568 605
569 int32_t fDataSize; // The size of the data needed by the pattern that 606 int32_t fDataSize; // The size of the data needed by the pattern that
570 // does not go on the state stack, but has just 607 // does not go on the state stack, but has just
571 // a single copy per matcher. 608 // a single copy per matcher.
572 609
573 UVector32 *fGroupMap; // Map from capture group number to position of 610 UVector32 *fGroupMap; // Map from capture group number to position of
574 // the group's variables in the matcher stack frame. 611 // the group's variables in the matcher stack frame.
575 612
576 int32_t fMaxCaptureDigits;
577
578 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined 613 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
579 // regex character classes, e.g. Word. 614 // regex character classes, e.g. Word.
580 615
581 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only 616 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
582 // sets for predefined regex classes. 617 // sets for predefined regex classes.
583 618
584 int32_t fStartType; // Info on how a match must start. 619 int32_t fStartType; // Info on how a match must start.
585 int32_t fInitialStringIdx; // 620 int32_t fInitialStringIdx; //
586 int32_t fInitialStringLen; 621 int32_t fInitialStringLen;
587 UnicodeSet *fInitialChars; 622 UnicodeSet *fInitialChars;
588 UChar32 fInitialChar; 623 UChar32 fInitialChar;
589 Regex8BitSet *fInitialChars8; 624 Regex8BitSet *fInitialChars8;
590 UBool fNeedsAltInput; 625 UBool fNeedsAltInput;
591 626
627 UHashtable *fNamedCaptureMap; // Map from capture group names to number s.
628
592 friend class RegexCompile; 629 friend class RegexCompile;
593 friend class RegexMatcher; 630 friend class RegexMatcher;
594 friend class RegexCImpl; 631 friend class RegexCImpl;
595 632
596 // 633 //
597 // Implementation Methods 634 // Implementation Methods
598 // 635 //
599 void init(); // Common initialization, for use by construc tors. 636 void init(); // Common initialization, for use by construc tors.
600 void zap(); // Common cleanup 637 void zap(); // Common cleanup
601 638
(...skipping 203 matching lines...) Expand 10 before | Expand all | Expand 10 after
805 * Find the next pattern match in the input string. 842 * Find the next pattern match in the input string.
806 * The find begins searching the input at the location following the end of 843 * The find begins searching the input at the location following the end of
807 * the previous match, or at the start of the string if there is no previous match. 844 * the previous match, or at the start of the string if there is no previous match.
808 * If a match is found, <code>start(), end()</code> and <code>group()</code> 845 * If a match is found, <code>start(), end()</code> and <code>group()</code>
809 * will provide more information regarding the match. 846 * will provide more information regarding the match.
810 * <p>Note that if the input string is changed by the application, 847 * <p>Note that if the input string is changed by the application,
811 * use find(startPos, status) instead of find(), because the saved starting 848 * use find(startPos, status) instead of find(), because the saved starting
812 * position may not be valid with the altered input string.</p> 849 * position may not be valid with the altered input string.</p>
813 * @param status A reference to a UErrorCode to receive any errors. 850 * @param status A reference to a UErrorCode to receive any errors.
814 * @return TRUE if a match is found. 851 * @return TRUE if a match is found.
815 * @internal 852 * @draft ICU 55
816 */ 853 */
817 virtual UBool find(UErrorCode &status); 854 virtual UBool find(UErrorCode &status);
818 855
819 /** 856 /**
820 * Resets this RegexMatcher and then attempts to find the next substring of the 857 * Resets this RegexMatcher and then attempts to find the next substring of the
821 * input string that matches the pattern, starting at the specified index. 858 * input string that matches the pattern, starting at the specified index.
822 * 859 *
823 * @param start The (native) index in the input string to begin the search. 860 * @param start The (native) index in the input string to begin the search.
824 * @param status A reference to a UErrorCode to receive any errors. 861 * @param status A reference to a UErrorCode to receive any errors.
825 * @return TRUE if a match is found. 862 * @return TRUE if a match is found.
(...skipping 11 matching lines...) Expand all
837 * @return a string containing the matched input text. 874 * @return a string containing the matched input text.
838 * @stable ICU 2.4 875 * @stable ICU 2.4
839 */ 876 */
840 virtual UnicodeString group(UErrorCode &status) const; 877 virtual UnicodeString group(UErrorCode &status) const;
841 878
842 879
843 /** 880 /**
844 * Returns a string containing the text captured by the given group 881 * Returns a string containing the text captured by the given group
845 * during the previous match operation. Group(0) is the entire match. 882 * during the previous match operation. Group(0) is the entire match.
846 * 883 *
884 * A zero length string is returned both for capture groups that did not
885 * participate in the match and for actual zero length matches.
886 * To distinguish between these two cases use the function start(),
887 * which returns -1 for non-participating groups.
888 *
847 * @param groupNum the capture group number 889 * @param groupNum the capture group number
848 * @param status A reference to a UErrorCode to receive any errors. 890 * @param status A reference to a UErrorCode to receive any errors.
849 * Possible errors are U_REGEX_INVALID_STATE if no match 891 * Possible errors are U_REGEX_INVALID_STATE if no match
850 * has been attempted or the last match failed and 892 * has been attempted or the last match failed and
851 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. 893 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
852 * @return the captured text 894 * @return the captured text
853 * @stable ICU 2.4 895 * @stable ICU 2.4
854 */ 896 */
855 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; 897 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
856 898
857
858 /** 899 /**
859 * Returns the number of capturing groups in this matcher's pattern. 900 * Returns the number of capturing groups in this matcher's pattern.
860 * @return the number of capture groups 901 * @return the number of capture groups
861 * @stable ICU 2.4 902 * @stable ICU 2.4
862 */ 903 */
863 virtual int32_t groupCount() const; 904 virtual int32_t groupCount() const;
864 905
865 906
866 /** 907 /**
867 * Returns a shallow clone of the entire live input string with the UText current native index 908 * Returns a shallow clone of the entire live input string with the UText current native index
868 * set to the beginning of the requested group. 909 * set to the beginning of the requested group.
869 * 910 *
870 * @param dest The UText into which the input should be cloned, or NULL to create a new UText 911 * @param dest The UText into which the input should be cloned, or NULL to create a new UText
871 * @param group_len A reference to receive the length of the desired capture group 912 * @param group_len A reference to receive the length of the desired capture group
872 * @param status A reference to a UErrorCode to receive any errors. 913 * @param status A reference to a UErrorCode to receive any errors.
873 * Possible errors are U_REGEX_INVALID_STATE if no match 914 * Possible errors are U_REGEX_INVALID_STATE if no match
874 * has been attempted or the last match failed and 915 * has been attempted or the last match failed and
875 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. 916 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
876 * @return dest if non-NULL, a shallow copy of the input text otherwise 917 * @return dest if non-NULL, a shallow copy of the input text otherwise
877 * 918 *
878 * @stable ICU 4.6 919 * @stable ICU 4.6
879 */ 920 */
880 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) co nst; 921 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) co nst;
881 922
882 /** 923 /**
883 * Returns a shallow clone of the entire live input string with the UText current native index 924 * Returns a shallow clone of the entire live input string with the UText current native index
884 * set to the beginning of the requested group. 925 * set to the beginning of the requested group.
885 * 926 *
927 * A group length of zero is returned both for capture groups that did not
928 * participate in the match and for actual zero length matches.
929 * To distinguish between these two cases use the function start(),
930 * which returns -1 for non-participating groups.
931 *
886 * @param groupNum The capture group number. 932 * @param groupNum The capture group number.
887 * @param dest The UText into which the input should be cloned, or NULL to create a new UText. 933 * @param dest The UText into which the input should be cloned, or NULL to create a new UText.
888 * @param group_len A reference to receive the length of the desired capture group 934 * @param group_len A reference to receive the length of the desired capture group
889 * @param status A reference to a UErrorCode to receive any errors. 935 * @param status A reference to a UErrorCode to receive any errors.
890 * Possible errors are U_REGEX_INVALID_STATE if no match 936 * Possible errors are U_REGEX_INVALID_STATE if no match
891 * has been attempted or the last match failed and 937 * has been attempted or the last match failed and
892 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. 938 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
893 * @return dest if non-NULL, a shallow copy of the input text otherwise 939 * @return dest if non-NULL, a shallow copy of the input text otherwise
894 * 940 *
895 * @stable ICU 4.6 941 * @stable ICU 4.6
896 */ 942 */
897 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErr orCode &status) const; 943 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErr orCode &status) const;
898 944
899 /** 945 /**
900 * Returns a string containing the text captured by the given group
901 * during the previous match operation. Group(0) is the entire match.
902 *
903 * @param groupNum the capture group number
904 * @param dest A mutable UText in which the matching text is placed.
905 * If NULL, a new UText will be created (which may not be mutable).
906 * @param status A reference to a UErrorCode to receive any errors.
907 * Possible errors are U_REGEX_INVALID_STATE if no match
908 * has been attempted or the last match failed.
909 * @return A string containing the matched input text. If a pre-allocated UText
910 * was provided, it will always be used and returned.
911 *
912 * @internal ICU 4.4 technology preview
913 */
914 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) cons t;
915
916
917 /**
918 * Returns the index in the input string of the start of the text matched 946 * Returns the index in the input string of the start of the text matched
919 * during the previous match operation. 947 * during the previous match operation.
920 * @param status a reference to a UErrorCode to receive any errors. 948 * @param status a reference to a UErrorCode to receive any errors.
921 * @return The (native) position in the input string of the start of the last match. 949 * @return The (native) position in the input string of the start of the last match.
922 * @stable ICU 2.4 950 * @stable ICU 2.4
923 */ 951 */
924 virtual int32_t start(UErrorCode &status) const; 952 virtual int32_t start(UErrorCode &status) const;
925 953
926 /** 954 /**
927 * Returns the index in the input string of the start of the text matched 955 * Returns the index in the input string of the start of the text matched
(...skipping 28 matching lines...) Expand all
956 * @param group the capture group number. 984 * @param group the capture group number.
957 * @param status A reference to a UErrorCode to receive any errors. Possible 985 * @param status A reference to a UErrorCode to receive any errors. Possible
958 * errors are U_REGEX_INVALID_STATE if no match has been 986 * errors are U_REGEX_INVALID_STATE if no match has been
959 * attempted or the last match failed, and 987 * attempted or the last match failed, and
960 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. 988 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
961 * @return the (native) start position of substring matched by the specified group. 989 * @return the (native) start position of substring matched by the specified group.
962 * @stable ICU 4.6 990 * @stable ICU 4.6
963 */ 991 */
964 virtual int64_t start64(int32_t group, UErrorCode &status) const; 992 virtual int64_t start64(int32_t group, UErrorCode &status) const;
965 993
966
967 /** 994 /**
968 * Returns the index in the input string of the first character following the 995 * Returns the index in the input string of the first character following the
969 * text matched during the previous match operation. 996 * text matched during the previous match operation.
970 * 997 *
971 * @param status A reference to a UErrorCode to receive any errors. Possible 998 * @param status A reference to a UErrorCode to receive any errors. Possible
972 * errors are U_REGEX_INVALID_STATE if no match has been 999 * errors are U_REGEX_INVALID_STATE if no match has been
973 * attempted or the last match failed. 1000 * attempted or the last match failed.
974 * @return the index of the last character matched, plus one. 1001 * @return the index of the last character matched, plus one.
975 * The index value returned is a native index, corresponding to 1002 * The index value returned is a native index, corresponding to
976 * code units for the underlying encoding type, for example, 1003 * code units for the underlying encoding type, for example,
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
1026 * @return the index of the first character following the text 1053 * @return the index of the first character following the text
1027 * captured by the specified group during the previous match operation. 1054 * captured by the specified group during the previous match operation.
1028 * Return -1 if the capture group exists in the pattern but was not part of the match. 1055 * Return -1 if the capture group exists in the pattern but was not part of the match.
1029 * The index value returned is a native index, corresponding to 1056 * The index value returned is a native index, corresponding to
1030 * code units for the underlying encoding type, for example, 1057 * code units for the underlying encoding type, for example,
1031 * a byte index for UTF8. 1058 * a byte index for UTF8.
1032 * @stable ICU 4.6 1059 * @stable ICU 4.6
1033 */ 1060 */
1034 virtual int64_t end64(int32_t group, UErrorCode &status) const; 1061 virtual int64_t end64(int32_t group, UErrorCode &status) const;
1035 1062
1036
1037 /** 1063 /**
1038 * Resets this matcher. The effect is to remove any memory of previous matches, 1064 * Resets this matcher. The effect is to remove any memory of previous matches,
1039 * and to cause subsequent find() operations to begin at the beginning of 1065 * and to cause subsequent find() operations to begin at the beginning of
1040 * the input string. 1066 * the input string.
1041 * 1067 *
1042 * @return this RegexMatcher. 1068 * @return this RegexMatcher.
1043 * @stable ICU 2.4 1069 * @stable ICU 2.4
1044 */ 1070 */
1045 virtual RegexMatcher &reset(); 1071 virtual RegexMatcher &reset();
1046 1072
(...skipping 801 matching lines...) Expand 10 before | Expand all | Expand 10 after
1848 1874
1849 UErrorCode fDeferredStatus; // Save error state that cannot be immediately 1875 UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1850 // reported, or that permanently disables this matcher. 1876 // reported, or that permanently disables this matcher.
1851 1877
1852 RuleBasedBreakIterator *fWordBreakItr; 1878 RuleBasedBreakIterator *fWordBreakItr;
1853 }; 1879 };
1854 1880
1855 U_NAMESPACE_END 1881 U_NAMESPACE_END
1856 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS 1882 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1857 #endif 1883 #endif
OLDNEW
« no previous file with comments | « source/i18n/unicode/rbnf.h ('k') | source/i18n/unicode/region.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698