OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (C) 2001-2004, International Business Machines Corporation |
| 3 * and others. All Rights Reserved. |
| 4 ********************************************************************** |
| 5 * Date Name Description |
| 6 * 07/23/01 aliu Creation. |
| 7 ********************************************************************** |
| 8 */ |
| 9 #ifndef STRMATCH_H |
| 10 #define STRMATCH_H |
| 11 |
| 12 #include "unicode/utypes.h" |
| 13 |
| 14 #if !UCONFIG_NO_TRANSLITERATION |
| 15 |
| 16 #include "unicode/unistr.h" |
| 17 #include "unicode/unifunct.h" |
| 18 #include "unicode/unimatch.h" |
| 19 #include "unicode/unirepl.h" |
| 20 |
| 21 U_NAMESPACE_BEGIN |
| 22 |
| 23 class TransliterationRuleData; |
| 24 |
| 25 /** |
| 26 * An object that matches a fixed input string, implementing the |
| 27 * UnicodeMatcher API. This object also implements the |
| 28 * UnicodeReplacer API, allowing it to emit the matched text as |
| 29 * output. Since the match text may contain flexible match elements, |
| 30 * such as UnicodeSets, the emitted text is not the match pattern, but |
| 31 * instead a substring of the actual matched text. Following |
| 32 * convention, the output text is the leftmost match seen up to this |
| 33 * point. |
| 34 * |
| 35 * A StringMatcher may represent a segment, in which case it has a |
| 36 * positive segment number. This affects how the matcher converts |
| 37 * itself to a pattern but does not otherwise affect its function. |
| 38 * |
| 39 * A StringMatcher that is not a segment should not be used as a |
| 40 * UnicodeReplacer. |
| 41 */ |
| 42 class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public Unico
deReplacer { |
| 43 |
| 44 public: |
| 45 |
| 46 /** |
| 47 * Construct a matcher that matches the given pattern string. |
| 48 * @param string the pattern to be matched, possibly containing |
| 49 * stand-ins that represent nested UnicodeMatcher objects. |
| 50 * @param start inclusive start index of text to be replaced |
| 51 * @param limit exclusive end index of text to be replaced; |
| 52 * must be greater than or equal to start. NOTE(review): "text to be replaced" mirrors the replace() docs; presumably start/limit index into 'string' -- confirm. |
| 53 * @param segmentNum the segment number from 1..n, or 0 if this is |
| 54 * not a segment. |
| 55 * @param data context object mapping stand-ins to |
| 56 * UnicodeMatcher objects. |
| 57 */ |
| 58 StringMatcher(const UnicodeString& string, |
| 59 int32_t start, |
| 60 int32_t limit, |
| 61 int32_t segmentNum, |
| 62 const TransliterationRuleData& data); |
| 63 |
| 64 /** |
| 65 * Copy constructor |
| 66 * @param o the object to be copied. |
| 67 */ |
| 68 StringMatcher(const StringMatcher& o); |
| 69 |
| 70 /** |
| 71 * Destructor |
| 72 */ |
| 73 virtual ~StringMatcher(); |
| 74 |
| 75 /** |
| 76 * Implement UnicodeFunctor |
| 77 * @return a copy of the object. |
| 78 */ |
| 79 virtual UnicodeFunctor* clone() const; |
| 80 |
| 81 /** |
| 82 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer |
| 83 * and return the pointer. |
| 84 * @return the UnicodeMatcher pointer. |
| 85 */ |
| 86 virtual UnicodeMatcher* toMatcher() const; |
| 87 |
| 88 /** |
| 89 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer |
| 90 * and return the pointer. |
| 91 * @return the UnicodeReplacer pointer. |
| 92 */ |
| 93 virtual UnicodeReplacer* toReplacer() const; |
| 94 |
| 95 /** |
| 96 * Implement UnicodeMatcher |
| 97 * @param text the text to be matched |
| 98 * @param offset on input, the index into text at which to begin |
| 99 * matching. On output, the limit of the matched text. The |
| 100 * number of matched characters is the output value of offset |
| 101 * minus the input value. Offset should always point to the |
| 102 * HIGH SURROGATE (leading code unit) of a pair of surrogates, |
| 103 * both on entry and upon return. |
| 104 * @param limit the limit index of text to be matched. Greater |
| 105 * than offset for a forward direction match, less than offset for |
| 106 * a backward direction match. The last character to be |
| 107 * considered for matching will be text.charAt(limit-1) in the |
| 108 * forward direction or text.charAt(limit+1) in the backward |
| 109 * direction. |
| 110 * @param incremental if TRUE, then assume further characters may |
| 111 * be inserted at limit and check for partial matching. Otherwise |
| 112 * assume the text as given is complete. |
| 113 * @return a match degree value indicating a full match, a partial |
| 114 * match, or a mismatch. If incremental is FALSE then |
| 115 * U_PARTIAL_MATCH should never be returned. |
| 116 */ |
| 117 virtual UMatchDegree matches(const Replaceable& text, |
| 118 int32_t& offset, |
| 119 int32_t limit, |
| 120 UBool incremental); |
| 121 |
| 122 /** |
| 123 * Implement UnicodeMatcher |
| 124 * @param result Output param to receive the pattern. |
| 125 * @param escapeUnprintable if TRUE then escape the unprintable characters. |
| 126 * @return A reference to 'result'. |
| 127 */ |
| 128 virtual UnicodeString& toPattern(UnicodeString& result, |
| 129 UBool escapeUnprintable = FALSE) const; |
| 130 |
| 131 /** |
| 132 * Implement UnicodeMatcher |
| 133 * Returns TRUE if this matcher will match a character c, where c |
| 134 * & 0xFF == v, at offset, in the forward direction (with limit > |
| 135 * offset). This is used by <tt>RuleBasedTransliterator</tt> for |
| 136 * indexing. |
| 137 * @param v the given value |
| 138 * @return TRUE if this matcher will match a character c, |
| 139 * where c & 0xFF == v |
| 140 */ |
| 141 virtual UBool matchesIndexValue(uint8_t v) const; |
| 142 |
| 143 /** |
| 144 * Implement UnicodeMatcher. Judging by the signature, unions the characters this matcher can match into toUnionTo (compare addReplacementSetTo) -- confirm against the implementation. |
| 145 */ |
| 146 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const; |
| 147 |
| 148 /** |
| 149 * Implement UnicodeFunctor. Receives the TransliterationRuleData context; presumably updates the stored 'data' pointer -- confirm against the implementation. |
| 150 */ |
| 151 virtual void setData(const TransliterationRuleData*); |
| 152 |
| 153 /** |
| 154 * Replace characters in 'text' from 'start' to 'limit' with the |
| 155 * output text of this object. Update the 'cursor' parameter to |
| 156 * give the cursor position and return the length of the |
| 157 * replacement text. |
| 158 * |
| 159 * @param text the text to be matched |
| 160 * @param start inclusive start index of text to be replaced |
| 161 * @param limit exclusive end index of text to be replaced; |
| 162 * must be greater than or equal to start |
| 163 * @param cursor output parameter for the cursor position. |
| 164 * Not all replacer objects will update this, but in a complete |
| 165 * tree of replacer objects, representing the entire output side |
| 166 * of a transliteration rule, at least one must update it. |
| 167 * @return the number of 16-bit code units in the text replacing |
| 168 * the characters at offsets start..(limit-1) in text |
| 169 */ |
| 170 virtual int32_t replace(Replaceable& text, |
| 171 int32_t start, |
| 172 int32_t limit, |
| 173 int32_t& cursor); |
| 174 |
| 175 /** |
| 176 * Returns a string representation of this replacer. If the |
| 177 * result of calling this function is passed to the appropriate |
| 178 * parser, typically TransliteratorParser, it will produce another |
| 179 * replacer that is equal to this one. |
| 180 * @param result the string to receive the pattern. Previous |
| 181 * contents will be deleted. |
| 182 * @param escapeUnprintable if TRUE then convert unprintable |
| 183 * characters to their hex escape representations, \\uxxxx or |
| 184 * \\Uxxxxxxxx. Unprintable characters are defined by |
| 185 * Utility.isUnprintable(). |
| 186 * @return a reference to 'result'. |
| 187 */ |
| 188 virtual UnicodeString& toReplacerPattern(UnicodeString& result, |
| 189 UBool escapeUnprintable) const; |
| 190 |
| 191 /** |
| 192 * Remove any match data. This must be called before performing a |
| 193 * set of matches with this segment. |
| 194 */ |
| 195 void resetMatch(); |
| 196 |
| 197 /** |
| 198 * ICU "poor man's RTTI", returns a UClassID for the actual class. |
| 199 * |
| 200 * @draft ICU 2.2 |
| 201 */ |
| 202 virtual UClassID getDynamicClassID() const; |
| 203 |
| 204 /** |
| 205 * ICU "poor man's RTTI", returns a UClassID for this class. |
| 206 * |
| 207 * @draft ICU 2.2 |
| 208 */ |
| 209 static UClassID U_EXPORT2 getStaticClassID(); |
| 210 |
| 211 /** |
| 212 * Union the set of all characters that may be output by this object |
| 213 * into the given set. |
| 214 * @param toUnionTo the set into which to union the output characters |
| 215 */ |
| 216 virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const; |
| 217 |
| 218 private: |
| 219 |
| 220 /** |
| 221 * The text to be matched. |
| 222 */ |
| 223 UnicodeString pattern; |
| 224 |
| 225 /** |
| 226 * Context object that maps stand-ins to matcher and replacer |
| 227 * objects. |
| 228 */ |
| 229 const TransliterationRuleData* data; |
| 230 |
| 231 /** |
| 232 * The segment number, 1-based, or 0 if not a segment. |
| 233 */ |
| 234 int32_t segmentNumber; |
| 235 |
| 236 /** |
| 237 * Start offset, in the match text, of the <em>rightmost</em> |
| 238 * match. NOTE(review): the class comment says the emitted text is the leftmost match seen so far -- confirm whether "rightmost" is intended here and on matchLimit. |
| 239 */ |
| 240 int32_t matchStart; |
| 241 |
| 242 /** |
| 243 * Limit offset, in the match text, of the <em>rightmost</em> |
| 244 * match. |
| 245 */ |
| 246 int32_t matchLimit; |
| 247 |
| 248 }; |
| 249 |
| 250 U_NAMESPACE_END |
| 251 |
| 252 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
| 253 |
| 254 #endif |
OLD | NEW |