icu46/source/i18n/rbt_pars.h - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/rbt_pars.h

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 1999-2007, International Business Machines Corporation

	4 * and others. All Rights Reserved.

	5 **********************************************************************

	6 * Date Name Description

	7 * 11/17/99 aliu Creation.

	8 **********************************************************************

	9 */

	10 #ifndef RBT_PARS_H

	11 #define RBT_PARS_H

	12

	13 #include "unicode/utypes.h"

	14

	15 #if !UCONFIG_NO_TRANSLITERATION

	16 #ifdef XP_CPLUSPLUS

	17

	18 #include "unicode/uobject.h"

	19 #include "unicode/parseerr.h"

	20 #include "unicode/unorm.h"

	21 #include "rbt.h"

	22 #include "hash.h"

	23 #include "uvector.h"

	24

	25 U_NAMESPACE_BEGIN

	26

	27 class TransliterationRuleData;

	28 class UnicodeFunctor;

	29 class ParseData;

	30 class RuleHalf;

	31 class ParsePosition;

	32 class StringMatcher;

	33

	34 class TransliteratorParser : public UMemory {

	35

	36 public:

	37

	38 /**

	39 * A Vector of TransliterationRuleData objects, one for each discrete group

	40 * of rules in the rule set

	41 */

	42 UVector dataVector;

	43

	44 /**

	45 * PUBLIC data member.

	46 * A Vector of UnicodeStrings containing all of the ID blocks in the rule set

	47 */

	48 UVector idBlockVector;

	49

	50 /**

	51 * PUBLIC data member containing the parsed compound filter, if any.

	52 */

	53 UnicodeSet* compoundFilter;

	54

	55 private:

	56

	57 /**

	58 * The current data object for which we are parsing rules

	59 */

	60 TransliterationRuleData* curData;

	61

	62 UTransDirection direction;

	63

	64 /**

	65 * Parse error information.

	66 */

	67 UParseError parseError;

	68

	69 /**

	70 * Temporary symbol table used during parsing.

	71 */

	72 ParseData* parseData;

	73

	74 /**

	75 * Temporary vector of matcher variables. When parsing is complete, this

	76 * is copied into the array data.variables. As with data.variables,

	77 * element 0 corresponds to character data.variablesBase.

	78 */

	79 UVector variablesVector;

	80

	81 /**

	82 * Temporary table of variable names. When parsing is complete, this is

	83 * copied into data.variableNames.

	84 */

	85 Hashtable variableNames;

	86

	87 /**

	88 * String of standins for segments. Used during the parsing of a single

	89 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds

	90 * to StringMatcher object segmentObjects.elementAt(0), etc.

	91 */

	92 UnicodeString segmentStandins;

	93

	94 /**

	95 * Vector of StringMatcher objects for segments. Used during the

	96 * parsing of a single rule.

	97 * segmentStandins.charAt(0) is the standin for "$1" and corresponds

	98 * to StringMatcher object segmentObjects.elementAt(0), etc.

	99 */

	100 UVector segmentObjects;

	101

	102 /**

	103 * The next available stand-in for variables. This starts at some point in

	104 * the private use area (discovered dynamically) and increments up toward

	105 * <code>variableLimit</code>. At any point during parsing, available

	106 * variables are <code>variableNext..variableLimit-1</code>.

	107 */

	108 UChar variableNext;

	109

	110 /**

	111 * The last available stand-in for variables. This is discovered

	112 * dynamically. At any point during parsing, available variables are

	113 * <code>variableNext..variableLimit-1</code>.

	114 */

	115 UChar variableLimit;

	116

	117 /**

	118 * When we encounter an undefined variable, we do not immediately signal

	119 * an error, in case we are defining this variable, e.g., "$a = [a-z];".

	120 * Instead, we save the name of the undefined variable, and substitute

	121 * in the placeholder char variableLimit - 1, and decrement

	122 * variableLimit.

	123 */

	124 UnicodeString undefinedVariableName;

	125

	126 /**

	127 * The stand-in character for the 'dot' set, represented by '.' in

	128 * patterns. This is allocated the first time it is needed, and

	129 * reused thereafter.

	130 */

	131 UChar dotStandIn;

	132

	133 public:

	134

	135 /**

	136 * Constructor.

	137 */

	138 TransliteratorParser(UErrorCode &statusReturn);

	139

	140 /**

	141 * Destructor.

	142 */

	143 ~TransliteratorParser();

	144

	145 /**

	146 * Parse the given string as a sequence of rules, separated by newline

	147 * characters ('\n'), and cause this object to implement those rules. Any

	148 * previous rules are discarded. Typically this method is called exactly

	149 * once after construction.

	150 *

	151 * Parse the given rules, in the given direction. After this call

	152 * returns, query the public data members for results. The caller

	153 * owns the 'data' and 'compoundFilter' data members after this

	154 * call returns.

	155 * @param rules rules, separated by ';'

	156 * @param direction either FORWARD or REVERSE.

	157 * @param pe Struct to receive information on position

	158 * of error if an error is encountered

	159 * @param ec Output param set to success/failure code.

	160 */

	161 void parse(const UnicodeString& rules,

	162 UTransDirection direction,

	163 UParseError& pe,

	164 UErrorCode& ec);

	165

	166 /**

	167 * Return the compound filter parsed by parse(). Caller owns result.

	168 * @return the compound filter parsed by parse().

	169 */

	170 UnicodeSet* orphanCompoundFilter();

	171

	172 private:

	173

	174 /**

	175 * Parse the given rules in the given direction.

	176 * @param rules the rules to parse (input; passed by const reference).

	177 * @param direction either FORWARD or REVERSE.

	178 */

	179 void parseRules(const UnicodeString& rules,

	180 UTransDirection direction,

	181 UErrorCode& status);

	182

	183 /**

	184 * MAIN PARSER. Parse the next rule in the given rule string, starting

	185 * at pos. Return the index after the last character parsed. Do not

	186 * parse characters at or after limit.

	187 *

	188 * Important: The character at pos must be a non-whitespace character

	189 * that is not the comment character.

	190 *

	191 * This method handles quoting, escaping, and whitespace removal. It

	192 * parses the end-of-rule character. It recognizes context and cursor

	193 * indicators. Once it does a lexical breakdown of the rule at pos, it

	194 * creates a rule object and adds it to our rule list.

	195 * @param rule the rule string being parsed (input; passed by const reference).

	196 * @param pos the starting position.

	197 * @param limit pointer past the last character of the rule.

	198 * @return the index after the last character parsed.

	199 */

	200 int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

	201

	202 /**

	203 * Set the variable range to [start, end] (inclusive).

	204 * @param start the start value of the range.

	205 * @param end the end value of the range.

	206 */

	207 void setVariableRange(int32_t start, int32_t end, UErrorCode& status);

	208

	209 /**

	210 * Assert that the given character is NOT within the variable range.

	211 * If it is, return FALSE. This is necessary to ensure that the

	212 * variable range does not overlap characters used in a rule.

	213 * @param ch the given character.

	214 * @return True, if the given character is NOT within the variable range.

	215 */

	216 UBool checkVariableRange(UChar32 ch) const;

	217

	218 /**

	219 * Set the maximum backup to 'backup', in response to a pragma

	220 * statement.

	221 * @param backup the new value to be set.

	222 */

	223 void pragmaMaximumBackup(int32_t backup);

	224

	225 /**

	226 * Begin normalizing all rules using the given mode, in response

	227 * to a pragma statement.

	228 * @param mode the given mode.

	229 */

	230 void pragmaNormalizeRules(UNormalizationMode mode);

	231

	232 /**

	233 * Return true if the given rule looks like a pragma.

	234 * @param pos offset to the first non-whitespace character

	235 * of the rule.

	236 * @param limit pointer past the last character of the rule.

	237 * @return true if the given rule looks like a pragma.

	238 */

	239 static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);

	240

	241 /**

	242 * Parse a pragma. This method assumes resemblesPragma() has

	243 * already returned true.

	244 * @param pos offset to the first non-whitespace character

	245 * of the rule.

	246 * @param limit pointer past the last character of the rule.

	247 * @return the position index after the final ';' of the pragma,

	248 * or -1 on failure.

	249 */

	250 int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

	251

	252 /**

	253 * Called by main parser upon syntax error. Search the rule string

	254 * for the probable end of the rule. Of course, if the error is that

	255 * the end of rule marker is missing, then the rule end will not be found.

	256 * In any case the rule start will be correctly reported.

	257 * @param parseErrorCode error code.

	258 * @param msg error description.

	259 * @param start position of first character of current rule.

	260 * @return start position of first character of current rule.

	261 */

	262 int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,

	263 UErrorCode& status);

	264

	265 /**

	266 * Parse a UnicodeSet out, store it, and return the stand-in character

	267 * used to represent it.

	268 *

	269 * @param rule the rule for UnicodeSet.

	270 * @param pos the position in pattern at which to start parsing.

	271 * @return the stand-in character used to represent it.

	272 */

	273 UChar parseSet(const UnicodeString& rule,

	274 ParsePosition& pos,

	275 UErrorCode& status);

	276

	277 /**

	278 * Generate and return a stand-in for a new UnicodeFunctor. Store

	279 * the matcher (adopt it).

	280 * @param adopted the UnicodeFunctor to be adopted.

	281 * @return a stand-in for a new UnicodeFunctor.

	282 */

	283 UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);

	284

	285 /**

	286 * Return the standin for segment seg (1-based).

	287 * @param seg the given segment.

	288 * @return the standIn character for the given segment.

	289 */

	290 UChar getSegmentStandin(int32_t seg, UErrorCode& status);

	291

	292 /**

	293 * Set the object for segment seg (1-based).

	294 * @param seg the given segment.

	295 * @param adopted the StringMatcher to be adopted.

	296 */

	297 void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);

	298

	299 /**

	300 * Return the stand-in for the dot set. It is allocated the first

	301 * time and reused thereafter.

	302 * @return the stand-in for the dot set.

	303 */

	304 UChar getDotStandIn(UErrorCode& status);

	305

	306 /**

	307 * Append the value of the given variable name to the given

	308 * UnicodeString.

	309 * @param name the variable name to be appended.

	310 * @param buf the given UnicodeString to append to.

	311 */

	312 void appendVariableDef(const UnicodeString& name,

	313 UnicodeString& buf,

	314 UErrorCode& status);

	315

	316 /**

	317 * Glue method to get around access restrictions in C++.

	318 */

	319 /*static Transliterator* createBasicInstance(const UnicodeString& id,

	320 const UnicodeString* canonID);*/

	321

	322 friend class RuleHalf;

	323

	324 // Disallowed methods; no impl.

	325 /**

	326 * Copy constructor

	327 */

	328 TransliteratorParser(const TransliteratorParser&);

	329

	330 /**

	331 * Assignment operator

	332 */

	333 TransliteratorParser& operator=(const TransliteratorParser&);

	334 };

	335

	336 U_NAMESPACE_END

	337

	338 #endif /* #ifdef XP_CPLUSPLUS */

	339

	340 /**

	341 * Strip/convert the following from the transliterator rules:

	342 * comments

	343 * newlines

	344 * white space at the beginning and end of a line

	345 * unescape \u notation

	346 *

	347 * The target must be equal in size as the source.

	348 * @internal

	349 */

	350 U_CAPI int32_t

	351 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);

	352

	353 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

	354

	355 #endif

OLD	NEW

« no previous file with comments | « icu46/source/i18n/rbt_data.cpp ('k') | icu46/source/i18n/rbt_pars.cpp » ('j') | no next file with comments »