icu46/source/i18n/rbt_pars.h - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/i18n/rbt_pars.h

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/i18n/rbt_pars.h

===================================================================

--- icu46/source/i18n/rbt_pars.h (revision 0)

+++ icu46/source/i18n/rbt_pars.h (revision 0)

@@ -0,0 +1,355 @@

+/*

+**********************************************************************

+* Date Name Description

+* 11/17/99 aliu Creation.

+**********************************************************************

+*/

+#ifndef RBT_PARS_H

+#define RBT_PARS_H

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_TRANSLITERATION

+#ifdef XP_CPLUSPLUS

+#include "unicode/uobject.h"

+#include "unicode/parseerr.h"

+#include "unicode/unorm.h"

+#include "rbt.h"

+#include "hash.h"

+#include "uvector.h"

+U_NAMESPACE_BEGIN

+class TransliterationRuleData;

+class UnicodeFunctor;

+class ParseData;

+class RuleHalf;

+class ParsePosition;

+class StringMatcher;

+class TransliteratorParser : public UMemory {

+ public:

+ /**

+ * A Vector of TransliterationRuleData objects, one for each discrete group

+ * of rules in the rule set

+ */

+ UVector dataVector;

+ /**

+ * PUBLIC data member.

+ * A Vector of UnicodeStrings containing all of the ID blocks in the rule set

+ */

+ UVector idBlockVector;

+ /**

+ * PUBLIC data member containing the parsed compound filter, if any.

+ */

+ UnicodeSet* compoundFilter;

+ private:

+ /**

+ * The current data object for which we are parsing rules

+ */

+ TransliterationRuleData* curData;

+ UTransDirection direction;

+ /**

+ * Parse error information.

+ */

+ UParseError parseError;

+ /**

+ * Temporary symbol table used during parsing.

+ */

+ ParseData* parseData;

+ /**

+ * Temporary vector of matcher variables. When parsing is complete, this

+ * is copied into the array data.variables. As with data.variables,

+ * element 0 corresponds to character data.variablesBase.

+ */

+ UVector variablesVector;

+ /**

+ * Temporary table of variable names. When parsing is complete, this is

+ * copied into data.variableNames.

+ */

+ Hashtable variableNames;

+ /**

+ * String of standins for segments. Used during the parsing of a single

+ * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds

+ * to StringMatcher object segmentObjects.elementAt(0), etc.

+ */

+ UnicodeString segmentStandins;

+ /**

+ * Vector of StringMatcher objects for segments. Used during the

+ * parsing of a single rule.

+ * segmentStandins.charAt(0) is the standin for "$1" and corresponds

+ * to StringMatcher object segmentObjects.elementAt(0), etc.

+ */

+ UVector segmentObjects;

+ /**

+ * The next available stand-in for variables. This starts at some point in

+ * the private use area (discovered dynamically) and increments up toward

+ * <code>variableLimit</code>. At any point during parsing, available

+ * variables are <code>variableNext..variableLimit-1</code>.

+ */

+ UChar variableNext;

+ /**

+ * The last available stand-in for variables. This is discovered

+ * dynamically. At any point during parsing, available variables are

+ * <code>variableNext..variableLimit-1</code>.

+ */

+ UChar variableLimit;

+ /**

+ * When we encounter an undefined variable, we do not immediately signal

+ * an error, in case we are defining this variable, e.g., "$a = [a-z];".

+ * Instead, we save the name of the undefined variable, and substitute

+ * in the placeholder char variableLimit - 1, and decrement

+ * variableLimit.

+ */

+ UnicodeString undefinedVariableName;

+ /**

+ * The stand-in character for the 'dot' set, represented by '.' in

+ * patterns. This is allocated the first time it is needed, and

+ * reused thereafter.

+ */

+ UChar dotStandIn;

+public:

+ /**

+ * Constructor.

+ */

+ TransliteratorParser(UErrorCode &statusReturn);

+ /**

+ * Destructor.

+ */

+ ~TransliteratorParser();

+ /**

+ * Parse the given string as a sequence of rules, separated by newline

+ * characters ('\n'), and cause this object to implement those rules. Any

+ * previous rules are discarded. Typically this method is called exactly

+ * once after construction.

+ *

+ * Parse the given rules, in the given direction. After this call

+ * returns, query the public data members for results. The caller

+ * owns the 'data' and 'compoundFilter' data members after this

+ * call returns.

+ * @param rules rules, separated by ';'

+ * @param direction either FORWARD or REVERSE.

+ * @param pe Struct to recieve information on position

+ * of error if an error is encountered

+ * @param ec Output param set to success/failure code.

+ */

+ void parse(const UnicodeString& rules,

+ UTransDirection direction,

+ UParseError& pe,

+ UErrorCode& ec);

+ /**

+ * Return the compound filter parsed by parse(). Caller owns result.

+ * @return the compound filter parsed by parse().

+ */

+ UnicodeSet* orphanCompoundFilter();

+private:

+ /**

+ * Return a representation of this transliterator as source rules.

+ * @param rules Output param to receive the rules.

+ * @param direction either FORWARD or REVERSE.

+ */

+ void parseRules(const UnicodeString& rules,

+ UTransDirection direction,

+ UErrorCode& status);

+ /**

+ * MAIN PARSER. Parse the next rule in the given rule string, starting

+ * at pos. Return the index after the last character parsed. Do not

+ * parse characters at or after limit.

+ *

+ * Important: The character at pos must be a non-whitespace character

+ * that is not the comment character.

+ *

+ * This method handles quoting, escaping, and whitespace removal. It

+ * parses the end-of-rule character. It recognizes context and cursor

+ * indicators. Once it does a lexical breakdown of the rule at pos, it

+ * creates a rule object and adds it to our rule list.

+ * @param rules Output param to receive the rules.

+ * @param pos the starting position.

+ * @param limit pointer past the last character of the rule.

+ * @return the index after the last character parsed.

+ */

+ int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

+ /**

+ * Set the variable range to [start, end] (inclusive).

+ * @param start the start value of the range.

+ * @param end the end value of the range.

+ */

+ void setVariableRange(int32_t start, int32_t end, UErrorCode& status);

+ /**

+ * Assert that the given character is NOT within the variable range.

+ * If it is, return FALSE. This is neccesary to ensure that the

+ * variable range does not overlap characters used in a rule.

+ * @param ch the given character.

+ * @return True, if the given character is NOT within the variable range.

+ */

+ UBool checkVariableRange(UChar32 ch) const;

+ /**

+ * Set the maximum backup to 'backup', in response to a pragma

+ * statement.

+ * @param backup the new value to be set.

+ */

+ void pragmaMaximumBackup(int32_t backup);

+ /**

+ * Begin normalizing all rules using the given mode, in response

+ * to a pragma statement.

+ * @param mode the given mode.

+ */

+ void pragmaNormalizeRules(UNormalizationMode mode);

+ /**

+ * Return true if the given rule looks like a pragma.

+ * @param pos offset to the first non-whitespace character

+ * of the rule.

+ * @param limit pointer past the last character of the rule.

+ * @return true if the given rule looks like a pragma.

+ */

+ static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);

+ /**

+ * Parse a pragma. This method assumes resemblesPragma() has

+ * already returned true.

+ * @param pos offset to the first non-whitespace character

+ * of the rule.

+ * @param limit pointer past the last character of the rule.

+ * @return the position index after the final ';' of the pragma,

+ * or -1 on failure.

+ */

+ int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

+ /**

+ * Called by main parser upon syntax error. Search the rule string

+ * for the probable end of the rule. Of course, if the error is that

+ * the end of rule marker is missing, then the rule end will not be found.

+ * In any case the rule start will be correctly reported.

+ * @param parseErrorCode error code.

+ * @param msg error description.

+ * @param start position of first character of current rule.

+ * @return start position of first character of current rule.

+ */

+ int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,

+ UErrorCode& status);

+ /**

+ * Parse a UnicodeSet out, store it, and return the stand-in character

+ * used to represent it.

+ *

+ * @param rule the rule for UnicodeSet.

+ * @param pos the position in pattern at which to start parsing.

+ * @return the stand-in character used to represent it.

+ */

+ UChar parseSet(const UnicodeString& rule,

+ ParsePosition& pos,

+ UErrorCode& status);

+ /**

+ * Generate and return a stand-in for a new UnicodeFunctor. Store

+ * the matcher (adopt it).

+ * @param adopted the UnicodeFunctor to be adopted.

+ * @return a stand-in for a new UnicodeFunctor.

+ */

+ UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);

+ /**

+ * Return the standin for segment seg (1-based).

+ * @param seg the given segment.

+ * @return the standIn character for the given segment.

+ */

+ UChar getSegmentStandin(int32_t seg, UErrorCode& status);

+ /**

+ * Set the object for segment seg (1-based).

+ * @param seg the given segment.

+ * @param adopted the StringMatcher to be adopted.

+ */

+ void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);

+ /**

+ * Return the stand-in for the dot set. It is allocated the first

+ * time and reused thereafter.

+ * @return the stand-in for the dot set.

+ */

+ UChar getDotStandIn(UErrorCode& status);

+ /**

+ * Append the value of the given variable name to the given

+ * UnicodeString.

+ * @param name the variable name to be appended.

+ * @param buf the given UnicodeString to append to.

+ */

+ void appendVariableDef(const UnicodeString& name,

+ UnicodeString& buf,

+ UErrorCode& status);

+ /**

+ * Glue method to get around access restrictions in C++.

+ */

+ /*static Transliterator* createBasicInstance(const UnicodeString& id,

+ const UnicodeString* canonID);*/

+ friend class RuleHalf;

+ // Disallowed methods; no impl.

+ /**

+ * Copy constructor

+ */

+ TransliteratorParser(const TransliteratorParser&);

+ /**

+ * Assignment operator

+ */

+ TransliteratorParser& operator=(const TransliteratorParser&);

+};

+U_NAMESPACE_END

+#endif /* #ifdef XP_CPLUSPLUS */

+/**

+ * Strip/convert the following from the transliterator rules:

+ * comments

+ * newlines

+ * white space at the beginning and end of a line

+ * unescape \u notation

+ *

+ * The target must be equal in size as the source.

+ * @internal

+ */

+U_CAPI int32_t

+utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);

+#endif /* #if !UCONFIG_NO_TRANSLITERATION */

+#endif

Property changes on: icu46/source/i18n/rbt_pars.h

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/i18n/rbt_data.cpp ('k') | icu46/source/i18n/rbt_pars.cpp » ('j') | no next file with comments »