OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 1999-2007, International Business Machines Corporation |
| 4 * and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 * Date Name Description |
| 7 * 11/17/99 aliu Creation. |
| 8 ********************************************************************** |
| 9 */ |
| 10 #ifndef RBT_PARS_H |
| 11 #define RBT_PARS_H |
| 12 |
| 13 #include "unicode/utypes.h" |
| 14 |
| 15 #if !UCONFIG_NO_TRANSLITERATION |
| 16 #ifdef XP_CPLUSPLUS |
| 17 |
| 18 #include "unicode/uobject.h" |
| 19 #include "unicode/parseerr.h" |
| 20 #include "unicode/unorm.h" |
| 21 #include "rbt.h" |
| 22 #include "hash.h" |
| 23 #include "uvector.h" |
| 24 |
| 25 U_NAMESPACE_BEGIN |
| 26 |
| 27 class TransliterationRuleData; |
| 28 class UnicodeFunctor; |
| 29 class ParseData; |
| 30 class RuleHalf; |
| 31 class ParsePosition; |
| 32 class StringMatcher; |
| 33 |
| 34 class TransliteratorParser : public UMemory { |
| 35 /* Parses transliterator rule text; after parse() returns, results are read from the public data members below. */ |
| 36 public: |
| 37 |
| 38 /** |
| 39 * A Vector of TransliterationRuleData objects, one for each discrete group |
| 40 * of rules in the rule set. |
| 41 */ |
| 42 UVector dataVector; |
| 43 |
| 44 /** |
| 45 * PUBLIC data member. |
| 46 * A Vector of UnicodeStrings containing all of the ID blocks in the rule set. |
| 47 */ |
| 48 UVector idBlockVector; |
| 49 |
| 50 /** |
| 51 * PUBLIC data member containing the parsed compound filter, if any. |
| * See also orphanCompoundFilter(), whose result is owned by the caller. |
| 52 */ |
| 53 UnicodeSet* compoundFilter; |
| 54 |
| 55 private: |
| 56 |
| 57 /** |
| 58 * The current data object for which we are parsing rules. |
| 59 */ |
| 60 TransliterationRuleData* curData; |
| 61 /* Rule direction; presumably the direction passed to parse() -- confirm in the implementation. */ |
| 62 UTransDirection direction; |
| 63 |
| 64 /** |
| 65 * Parse error information. |
| 66 */ |
| 67 UParseError parseError; |
| 68 |
| 69 /** |
| 70 * Temporary symbol table used during parsing. |
| 71 */ |
| 72 ParseData* parseData; |
| 73 |
| 74 /** |
| 75 * Temporary vector of matcher variables. When parsing is complete, this |
| 76 * is copied into the array data.variables. As with data.variables, |
| 77 * element 0 corresponds to character data.variablesBase. |
| 78 */ |
| 79 UVector variablesVector; |
| 80 |
| 81 /** |
| 82 * Temporary table of variable names. When parsing is complete, this is |
| 83 * copied into data.variableNames. |
| 84 */ |
| 85 Hashtable variableNames; |
| 86 |
| 87 /** |
| 88 * String of standins for segments. Used during the parsing of a single |
| 89 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds |
| 90 * to StringMatcher object segmentObjects.elementAt(0), etc. |
| 91 */ |
| 92 UnicodeString segmentStandins; |
| 93 |
| 94 /** |
| 95 * Vector of StringMatcher objects for segments. Used during the |
| 96 * parsing of a single rule. |
| 97 * segmentStandins.charAt(0) is the standin for "$1" and corresponds |
| 98 * to StringMatcher object segmentObjects.elementAt(0), etc. |
| 99 */ |
| 100 UVector segmentObjects; |
| 101 |
| 102 /** |
| 103 * The next available stand-in for variables. This starts at some point in |
| 104 * the private use area (discovered dynamically) and increments up toward |
| 105 * <code>variableLimit</code>. At any point during parsing, available |
| 106 * variables are <code>variableNext..variableLimit-1</code>. |
| 107 */ |
| 108 UChar variableNext; |
| 109 |
| 110 /** |
| 111 * The last available stand-in for variables. This is discovered |
| 112 * dynamically. At any point during parsing, available variables are |
| 113 * <code>variableNext..variableLimit-1</code>. |
| 114 */ |
| 115 UChar variableLimit; |
| 116 |
| 117 /** |
| 118 * When we encounter an undefined variable, we do not immediately signal |
| 119 * an error, in case we are defining this variable, e.g., "$a = [a-z];". |
| 120 * Instead, we save the name of the undefined variable, and substitute |
| 121 * in the placeholder char variableLimit - 1, and decrement |
| 122 * variableLimit. |
| 123 */ |
| 124 UnicodeString undefinedVariableName; |
| 125 |
| 126 /** |
| 127 * The stand-in character for the 'dot' set, represented by '.' in |
| 128 * patterns. This is allocated the first time it is needed, and |
| 129 * reused thereafter. |
| 130 */ |
| 131 UChar dotStandIn; |
| 132 |
| 133 public: |
| 134 |
| 135 /** |
| 136 * Constructor. |
| * @param statusReturn error code reference; presumably set if construction |
| * fails (TODO confirm in the implementation). |
| 137 */ |
| 138 TransliteratorParser(UErrorCode &statusReturn); |
| 139 |
| 140 /** |
| 141 * Destructor. |
| 142 */ |
| 143 ~TransliteratorParser(); |
| 144 |
| 145 /** |
| 146 * Parse the given string as a sequence of rules, separated by newline |
| 147 * characters ('\n'), and cause this object to implement those rules. Any |
| 148 * previous rules are discarded. Typically this method is called exactly |
| 149 * once after construction. |
| 150 * |
| 151 * Parse the given rules, in the given direction. After this call |
| 152 * returns, query the public data members for results. The caller |
| 153 * owns the 'data' and 'compoundFilter' data members after this |
| 154 * call returns. |
| * |
| * NOTE(review): the first paragraph says rules are separated by newlines, |
| * while the @param below says ';' -- confirm which is accurate. Also, no |
| * member named 'data' is declared in this class; presumably the contents |
| * of dataVector are meant -- confirm. |
| 155 * @param rules rules, separated by ';' |
| 156 * @param direction either FORWARD or REVERSE. |
| 157 * @param pe Struct to receive information on position |
| 158 * of error if an error is encountered |
| 159 * @param ec Output param set to success/failure code. |
| 160 */ |
| 161 void parse(const UnicodeString& rules, |
| 162 UTransDirection direction, |
| 163 UParseError& pe, |
| 164 UErrorCode& ec); |
| 165 |
| 166 /** |
| 167 * Return the compound filter parsed by parse(). Caller owns result. |
| 168 * @return the compound filter parsed by parse(). |
| 169 */ |
| 170 UnicodeSet* orphanCompoundFilter(); |
| 171 |
| 172 private: |
| 173 |
| 174 /** |
| 175 * Parse the given rules in the given direction. |
| 176 * @param rules the rules to parse (input only; passed by const reference). |
| 177 * @param direction either FORWARD or REVERSE. |
| * @param status error code reference; presumably set on failure (TODO confirm). |
| 178 */ |
| 179 void parseRules(const UnicodeString& rules, |
| 180 UTransDirection direction, |
| 181 UErrorCode& status); |
| 182 |
| 183 /** |
| 184 * MAIN PARSER. Parse the next rule in the given rule string, starting |
| 185 * at pos. Return the index after the last character parsed. Do not |
| 186 * parse characters at or after limit. |
| 187 * |
| 188 * Important: The character at pos must be a non-whitespace character |
| 189 * that is not the comment character. |
| 190 * |
| 191 * This method handles quoting, escaping, and whitespace removal. It |
| 192 * parses the end-of-rule character. It recognizes context and cursor |
| 193 * indicators. Once it does a lexical breakdown of the rule at pos, it |
| 194 * creates a rule object and adds it to our rule list. |
| 195 * @param rule the rule string being parsed (input only; passed by const reference). |
| 196 * @param pos the starting position. |
| 197 * @param limit pointer past the last character of the rule. |
| * @param status error code reference; presumably set on failure (TODO confirm). |
| 198 * @return the index after the last character parsed. |
| 199 */ |
| 200 int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UEr
rorCode& status); |
| 201 |
| 202 /** |
| 203 * Set the variable range to [start, end] (inclusive). |
| 204 * @param start the start value of the range. |
| 205 * @param end the end value of the range. |
| * @param status error code reference; presumably set on failure (TODO confirm). |
| 206 */ |
| 207 void setVariableRange(int32_t start, int32_t end, UErrorCode& status); |
| 208 |
| 209 /** |
| 210 * Assert that the given character is NOT within the variable range. |
| 211 * If it is, return FALSE. This is necessary to ensure that the |
| 212 * variable range does not overlap characters used in a rule. |
| 213 * @param ch the given character. |
| 214 * @return True, if the given character is NOT within the variable range. |
| 215 */ |
| 216 UBool checkVariableRange(UChar32 ch) const; |
| 217 |
| 218 /** |
| 219 * Set the maximum backup to 'backup', in response to a pragma |
| 220 * statement. |
| 221 * @param backup the new value to be set. |
| 222 */ |
| 223 void pragmaMaximumBackup(int32_t backup); |
| 224 |
| 225 /** |
| 226 * Begin normalizing all rules using the given mode, in response |
| 227 * to a pragma statement. |
| 228 * @param mode the given mode. |
| 229 */ |
| 230 void pragmaNormalizeRules(UNormalizationMode mode); |
| 231 |
| 232 /** |
| 233 * Return true if the given rule looks like a pragma. |
| * @param rule the rule string to examine. |
| 234 * @param pos offset to the first non-whitespace character |
| 235 * of the rule. |
| 236 * @param limit pointer past the last character of the rule. |
| 237 * @return true if the given rule looks like a pragma. |
| 238 */ |
| 239 static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t
limit); |
| 240 |
| 241 /** |
| 242 * Parse a pragma. This method assumes resemblesPragma() has |
| 243 * already returned true. |
| * @param rule the rule string containing the pragma. |
| 244 * @param pos offset to the first non-whitespace character |
| 245 * of the rule. |
| 246 * @param limit pointer past the last character of the rule. |
| * @param status error code reference; presumably set on failure (TODO confirm). |
| 247 * @return the position index after the final ';' of the pragma, |
| 248 * or -1 on failure. |
| 249 */ |
| 250 int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, U
ErrorCode& status); |
| 251 |
| 252 /** |
| 253 * Called by main parser upon syntax error. Search the rule string |
| 254 * for the probable end of the rule. Of course, if the error is that |
| 255 * the end of rule marker is missing, then the rule end will not be found. |
| 256 * In any case the rule start will be correctly reported. |
| 257 * @param parseErrorCode error code. |
| 258 * @param msg error description. (The second parameter is unnamed in this |
| * declaration; presumably 'msg' refers to it -- confirm.) |
| 259 * @param start position of first character of current rule. |
| * @param status error code reference; presumably receives parseErrorCode |
| * (TODO confirm in the implementation). |
| 260 * @return start position of first character of current rule. |
| 261 */ |
| 262 int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t
start, |
| 263 UErrorCode& status); |
| 264 |
| 265 /** |
| 266 * Parse a UnicodeSet out, store it, and return the stand-in character |
| 267 * used to represent it. |
| 268 * |
| 269 * @param rule the rule for UnicodeSet. |
| 270 * @param pos the position in pattern at which to start parsing. |
| * @param status error code reference; presumably set on failure (TODO confirm). |
| 271 * @return the stand-in character used to represent it. |
| 272 */ |
| 273 UChar parseSet(const UnicodeString& rule, |
| 274 ParsePosition& pos, |
| 275 UErrorCode& status); |
| 276 |
| 277 /** |
| 278 * Generate and return a stand-in for a new UnicodeFunctor. Store |
| 279 * the matcher (adopt it). |
| 280 * @param adopted the UnicodeFunctor to be adopted. |
| * @param status error code reference; presumably set on failure (TODO confirm). |
| 281 * @return a stand-in for a new UnicodeFunctor. |
| 282 */ |
| 283 UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); |
| 284 |
| 285 /** |
| 286 * Return the standin for segment seg (1-based). |
| 287 * @param seg the given segment. |
| * @param status error code reference; presumably set on failure (TODO confirm). |
| 288 * @return the standIn character for the given segment. |
| 289 */ |
| 290 UChar getSegmentStandin(int32_t seg, UErrorCode& status); |
| 291 |
| 292 /** |
| 293 * Set the object for segment seg (1-based). |
| 294 * @param seg the given segment. |
| 295 * @param adopted the StringMatcher to be adopted. |
| * @param status error code reference; presumably set on failure (TODO confirm). |
| 296 */ |
| 297 void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& statu
s); |
| 298 |
| 299 /** |
| 300 * Return the stand-in for the dot set. It is allocated the first |
| 301 * time and reused thereafter. |
| * @param status error code reference; presumably set on failure (TODO confirm). |
| 302 * @return the stand-in for the dot set. |
| 303 */ |
| 304 UChar getDotStandIn(UErrorCode& status); |
| 305 |
| 306 /** |
| 307 * Append the value of the given variable name to the given |
| 308 * UnicodeString. |
| 309 * @param name the variable name to be appended. |
| 310 * @param buf the given UnicodeString to append to. |
| * @param status error code reference; presumably set on failure (TODO confirm). |
| 311 */ |
| 312 void appendVariableDef(const UnicodeString& name, |
| 313 UnicodeString& buf, |
| 314 UErrorCode& status); |
| 315 |
| 316 /** |
| 317 * Glue method to get around access restrictions in C++. |
| 318 */ |
| 319 /*static Transliterator* createBasicInstance(const UnicodeString& id, |
| 320 const UnicodeString* canonID);*/ |
| 321 |
| 322 friend class RuleHalf; |
| 323 |
| 324 // Disallowed methods; no impl. |
| 325 /** |
| 326 * Copy constructor. Declared in the private section and, per the note |
| * above, not implemented, so instances cannot be copied. |
| 327 */ |
| 328 TransliteratorParser(const TransliteratorParser&); |
| 329 |
| 330 /** |
| 331 * Assignment operator. Declared in the private section and, per the note |
| * above, not implemented, so instances cannot be assigned. |
| 332 */ |
| 333 TransliteratorParser& operator=(const TransliteratorParser&); |
| 334 }; |
| 335 |
| 336 U_NAMESPACE_END |
| 337 |
| 338 #endif /* #ifdef XP_CPLUSPLUS */ |
| 339 |
| 340 /** |
| 341 * Strip/convert the following from the transliterator rules: |
| 342 * comments |
| 343 * newlines |
| 344 * white space at the beginning and end of a line |
| 345 * unescape \u notation |
| 346 * |
| 347 * The target buffer must be the same size as the source. |
| * |
| * @param source the transliterator rules to process (read-only). |
| * @param sourceLen length of source; presumably counted in UChars (TODO confirm). |
| * @param target buffer that receives the stripped rules. |
| * @param status pointer to an error code; presumably set on failure (TODO confirm). |
| * @return NOTE(review): not documented here; presumably the length of the |
| * text written to target -- confirm in the implementation. |
| 348 * @internal |
| 349 */ |
| 350 U_CAPI int32_t |
| 351 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorC
ode *status); |
| 352 |
| 353 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
| 354 |
| 355 #endif |
OLD | NEW |