| Index: icu46/source/i18n/rbt_pars.h
|
| ===================================================================
|
| --- icu46/source/i18n/rbt_pars.h (revision 0)
|
| +++ icu46/source/i18n/rbt_pars.h (revision 0)
|
| @@ -0,0 +1,355 @@
|
| +/*
|
| +**********************************************************************
|
| +* Copyright (C) 1999-2007, International Business Machines Corporation
|
| +* and others. All Rights Reserved.
|
| +**********************************************************************
|
| +* Date Name Description
|
| +* 11/17/99 aliu Creation.
|
| +**********************************************************************
|
| +*/
|
| +#ifndef RBT_PARS_H
|
| +#define RBT_PARS_H
|
| +
|
| +#include "unicode/utypes.h"
|
| +
|
| +#if !UCONFIG_NO_TRANSLITERATION
|
| +#ifdef XP_CPLUSPLUS
|
| +
|
| +#include "unicode/uobject.h"
|
| +#include "unicode/parseerr.h"
|
| +#include "unicode/unorm.h"
|
| +#include "rbt.h"
|
| +#include "hash.h"
|
| +#include "uvector.h"
|
| +
|
| +U_NAMESPACE_BEGIN
|
| +
|
| +class TransliterationRuleData;
|
| +class UnicodeFunctor;
|
| +class ParseData;
|
| +class RuleHalf;
|
| +class ParsePosition;
|
| +class StringMatcher;
|
| +
|
| +class TransliteratorParser : public UMemory { // Parses transliterator rules; after parse(), results are read from the public data members.
|
| +
|
| + public:
|
| +
|
| + /**
|
| + * A Vector of TransliterationRuleData objects, one for each discrete group
|
| + * of rules in the rule set
|
| + */
|
| + UVector dataVector;
|
| +
|
| + /**
|
| + * PUBLIC data member.
|
| + * A Vector of UnicodeStrings containing all of the ID blocks in the rule set
|
| + */
|
| + UVector idBlockVector;
|
| +
|
| + /**
|
| + * PUBLIC data member containing the parsed compound filter, if any.
|
| + */
|
| + UnicodeSet* compoundFilter;
|
| +
|
| + private:
|
| +
|
| + /**
|
| + * The current data object for which we are parsing rules
|
| + */
|
| + TransliterationRuleData* curData;
|
| +
|
| + UTransDirection direction;
|
| +
|
| + /**
|
| + * Parse error information.
|
| + */
|
| + UParseError parseError;
|
| +
|
| + /**
|
| + * Temporary symbol table used during parsing.
|
| + */
|
| + ParseData* parseData;
|
| +
|
| + /**
|
| + * Temporary vector of matcher variables. When parsing is complete, this
|
| + * is copied into the array data.variables. As with data.variables,
|
| + * element 0 corresponds to character data.variablesBase.
|
| + */
|
| + UVector variablesVector;
|
| +
|
| + /**
|
| + * Temporary table of variable names. When parsing is complete, this is
|
| + * copied into data.variableNames.
|
| + */
|
| + Hashtable variableNames;
|
| +
|
| + /**
|
| + * String of standins for segments. Used during the parsing of a single
|
| + * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
|
| + * to StringMatcher object segmentObjects.elementAt(0), etc.
|
| + */
|
| + UnicodeString segmentStandins;
|
| +
|
| + /**
|
| + * Vector of StringMatcher objects for segments. Used during the
|
| + * parsing of a single rule.
|
| + * segmentStandins.charAt(0) is the standin for "$1" and corresponds
|
| + * to StringMatcher object segmentObjects.elementAt(0), etc.
|
| + */
|
| + UVector segmentObjects;
|
| +
|
| + /**
|
| + * The next available stand-in for variables. This starts at some point in
|
| + * the private use area (discovered dynamically) and increments up toward
|
| + * <code>variableLimit</code>. At any point during parsing, available
|
| + * variables are <code>variableNext..variableLimit-1</code>.
|
| + */
|
| + UChar variableNext;
|
| +
|
| + /**
|
| + * One past the last available stand-in for variables. This is discovered
|
| + * dynamically. At any point during parsing, available variables are
|
| + * <code>variableNext..variableLimit-1</code>.
|
| + */
|
| + UChar variableLimit;
|
| +
|
| + /**
|
| + * When we encounter an undefined variable, we do not immediately signal
|
| + * an error, in case we are defining this variable, e.g., "$a = [a-z];".
|
| + * Instead, we save the name of the undefined variable, and substitute
|
| + * in the placeholder char variableLimit - 1, and decrement
|
| + * variableLimit.
|
| + */
|
| + UnicodeString undefinedVariableName;
|
| +
|
| + /**
|
| + * The stand-in character for the 'dot' set, represented by '.' in
|
| + * patterns. This is allocated the first time it is needed, and
|
| + * reused thereafter.
|
| + */
|
| + UChar dotStandIn;
|
| +
|
| +public:
|
| +
|
| + /**
|
| + * Constructor.
|
| + */
|
| + TransliteratorParser(UErrorCode &statusReturn);
|
| +
|
| + /**
|
| + * Destructor.
|
| + */
|
| + ~TransliteratorParser();
|
| +
|
| + /**
|
| + * Parse the given string as a sequence of rules, separated by newline
|
| + * characters ('\n'), and cause this object to implement those rules. Any
|
| + * previous rules are discarded. Typically this method is called exactly
|
| + * once after construction.
|
| + *
|
| + * Parse the given rules, in the given direction. After this call
|
| + * returns, query the public data members for results. The caller
|
| + * owns the 'data' and 'compoundFilter' data members after this
|
| + * call returns. NOTE(review): no member named 'data' is declared in this class; presumably dataVector is meant -- confirm.
|
| + * @param rules rules, separated by ';'
|
| + * @param direction either FORWARD or REVERSE.
|
| + * @param pe Struct to receive information on position
|
| + * of error if an error is encountered
|
| + * @param ec Output param set to success/failure code.
|
| + */
|
| + void parse(const UnicodeString& rules,
|
| + UTransDirection direction,
|
| + UParseError& pe,
|
| + UErrorCode& ec);
|
| +
|
| + /**
|
| + * Return the compound filter parsed by parse(). Caller owns result.
|
| + * @return the compound filter parsed by parse().
|
| + */
|
| + UnicodeSet* orphanCompoundFilter();
|
| +
|
| +private:
|
| +
|
| + /**
|
| + * Parse the given rules in the given direction. NOTE(review): description inferred from the signature; confirm against the implementation.
|
| + * @param rules the rules to parse (input; taken by const reference).
|
| + * @param direction either FORWARD or REVERSE.
|
| + */
|
| + void parseRules(const UnicodeString& rules,
|
| + UTransDirection direction,
|
| + UErrorCode& status);
|
| +
|
| + /**
|
| + * MAIN PARSER. Parse the next rule in the given rule string, starting
|
| + * at pos. Return the index after the last character parsed. Do not
|
| + * parse characters at or after limit.
|
| + *
|
| + * Important: The character at pos must be a non-whitespace character
|
| + * that is not the comment character.
|
| + *
|
| + * This method handles quoting, escaping, and whitespace removal. It
|
| + * parses the end-of-rule character. It recognizes context and cursor
|
| + * indicators. Once it does a lexical breakdown of the rule at pos, it
|
| + * creates a rule object and adds it to our rule list.
|
| + * @param rule the rule string to parse (input; taken by const reference).
|
| + * @param pos the starting position.
|
| + * @param limit index past the last character of the rule (an int32_t offset, not a pointer).
|
| + * @return the index after the last character parsed.
|
| + */
|
| + int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
|
| +
|
| + /**
|
| + * Set the variable range to [start, end] (inclusive).
|
| + * @param start the start value of the range.
|
| + * @param end the end value of the range.
|
| + */
|
| + void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
|
| +
|
| + /**
|
| + * Assert that the given character is NOT within the variable range.
|
| + * If it is, return FALSE. This is necessary to ensure that the
|
| + * variable range does not overlap characters used in a rule.
|
| + * @param ch the given character.
|
| + * @return True, if the given character is NOT within the variable range.
|
| + */
|
| + UBool checkVariableRange(UChar32 ch) const;
|
| +
|
| + /**
|
| + * Set the maximum backup to 'backup', in response to a pragma
|
| + * statement.
|
| + * @param backup the new value to be set.
|
| + */
|
| + void pragmaMaximumBackup(int32_t backup);
|
| +
|
| + /**
|
| + * Begin normalizing all rules using the given mode, in response
|
| + * to a pragma statement.
|
| + * @param mode the given mode.
|
| + */
|
| + void pragmaNormalizeRules(UNormalizationMode mode);
|
| +
|
| + /**
|
| + * Return true if the given rule looks like a pragma.
|
| + * @param pos offset to the first non-whitespace character
|
| + * of the rule.
|
| + * @param limit index past the last character of the rule (an int32_t offset, not a pointer).
|
| + * @return true if the given rule looks like a pragma.
|
| + */
|
| + static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
|
| +
|
| + /**
|
| + * Parse a pragma. This method assumes resemblesPragma() has
|
| + * already returned true.
|
| + * @param pos offset to the first non-whitespace character
|
| + * of the rule.
|
| + * @param limit index past the last character of the rule (an int32_t offset, not a pointer).
|
| + * @return the position index after the final ';' of the pragma,
|
| + * or -1 on failure.
|
| + */
|
| + int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
|
| +
|
| + /**
|
| + * Called by main parser upon syntax error. Search the rule string
|
| + * for the probable end of the rule. Of course, if the error is that
|
| + * the end of rule marker is missing, then the rule end will not be found.
|
| + * In any case the rule start will be correctly reported.
|
| + * @param parseErrorCode error code.
|
| + * @param msg error description (the const UnicodeString& parameter, unnamed in this declaration).
|
| + * @param start position of first character of current rule.
|
| + * @return start position of first character of current rule.
|
| + */
|
| + int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
|
| + UErrorCode& status);
|
| +
|
| + /**
|
| + * Parse a UnicodeSet out, store it, and return the stand-in character
|
| + * used to represent it.
|
| + *
|
| + * @param rule the rule for UnicodeSet.
|
| + * @param pos the position in pattern at which to start parsing.
|
| + * @return the stand-in character used to represent it.
|
| + */
|
| + UChar parseSet(const UnicodeString& rule,
|
| + ParsePosition& pos,
|
| + UErrorCode& status);
|
| +
|
| + /**
|
| + * Generate and return a stand-in for a new UnicodeFunctor. Store
|
| + * the matcher (adopt it).
|
| + * @param adopted the UnicodeFunctor to be adopted.
|
| + * @return a stand-in for a new UnicodeFunctor.
|
| + */
|
| + UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
|
| +
|
| + /**
|
| + * Return the standin for segment seg (1-based).
|
| + * @param seg the given segment.
|
| + * @return the standIn character for the given segment.
|
| + */
|
| + UChar getSegmentStandin(int32_t seg, UErrorCode& status);
|
| +
|
| + /**
|
| + * Set the object for segment seg (1-based).
|
| + * @param seg the given segment.
|
| + * @param adopted the StringMatcher to be adopted.
|
| + */
|
| + void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
|
| +
|
| + /**
|
| + * Return the stand-in for the dot set. It is allocated the first
|
| + * time and reused thereafter.
|
| + * @return the stand-in for the dot set.
|
| + */
|
| + UChar getDotStandIn(UErrorCode& status);
|
| +
|
| + /**
|
| + * Append the value of the given variable name to the given
|
| + * UnicodeString.
|
| + * @param name the name of the variable whose value is appended.
|
| + * @param buf the given UnicodeString to append to.
|
| + */
|
| + void appendVariableDef(const UnicodeString& name,
|
| + UnicodeString& buf,
|
| + UErrorCode& status);
|
| +
|
| + /**
|
| + * Glue method to get around access restrictions in C++.
|
| + */
|
| + /*static Transliterator* createBasicInstance(const UnicodeString& id,
|
| + const UnicodeString* canonID);*/
|
| +
|
| + friend class RuleHalf;
|
| +
|
| + // Disallowed methods; no impl.
|
| + /**
|
| + * Copy constructor (disallowed; declared private and not implemented).
|
| + */
|
| + TransliteratorParser(const TransliteratorParser&);
|
| +
|
| + /**
|
| + * Assignment operator (disallowed; declared private and not implemented).
|
| + */
|
| + TransliteratorParser& operator=(const TransliteratorParser&);
|
| +};
|
| +
|
| +U_NAMESPACE_END
|
| +
|
| +#endif /* #ifdef XP_CPLUSPLUS */
|
| +
|
| +/**
|
| + * Strip/convert the following from the transliterator rules:
|
| + * comments
|
| + * newlines
|
| + * white space at the beginning and end of a line
|
| + * unescape \u notation
|
| + *
|
| + * The target buffer must be the same size as the source.
|
| + * @internal
|
| + */
|
| +U_CAPI int32_t
|
| +utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
|
| +
|
| +#endif /* #if !UCONFIG_NO_TRANSLITERATION */
|
| +
|
| +#endif
|
|
|
| Property changes on: icu46/source/i18n/rbt_pars.h
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|