Index: icu46/source/i18n/rbt_rule.h |
=================================================================== |
--- icu46/source/i18n/rbt_rule.h (revision 0) |
+++ icu46/source/i18n/rbt_rule.h (revision 0) |
@@ -0,0 +1,308 @@ |
+/* |
+* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved. |
+********************************************************************** |
+* Date Name Description |
+* 11/17/99 aliu Creation. |
+********************************************************************** |
+*/ |
+#ifndef RBT_RULE_H |
+#define RBT_RULE_H |
+ |
+#include "unicode/utypes.h" |
+ |
+#if !UCONFIG_NO_TRANSLITERATION |
+ |
+#include "unicode/uobject.h" |
+#include "unicode/unistr.h" |
+#include "unicode/utrans.h" |
+#include "unicode/unimatch.h" |
+ |
+U_NAMESPACE_BEGIN |
+ |
+class Replaceable; |
+class TransliterationRuleData; |
+class StringMatcher; |
+class UnicodeFunctor; |
+ |
+/** |
+ * A transliteration rule used by |
+ * <code>RuleBasedTransliterator</code>. |
+ * <code>TransliterationRule</code> is an immutable object. |
+ * |
+ * <p>A rule consists of an input pattern and an output string. When |
+ * the input pattern is matched, the output string is emitted. The |
+ * input pattern consists of zero or more characters which are matched |
+ * exactly (the key) and optional context. Context must match if it |
+ * is specified. Context may be specified before the key, after the |
+ * key, or both. The key, preceding context, and following context |
+ * may contain variables. Variables represent a set of Unicode |
+ * characters, such as the letters <i>a</i> through <i>z</i>. |
+ * Variables are detected by looking up each character in a supplied |
+ * variable list to see if it has been so defined. |
+ * |
+ * <p>A rule may contain segments in its input string and segment |
+ * references in its output string. A segment is a substring of the |
+ * input pattern, indicated by an offset and limit. The segment may |
+ * be in the preceding or following context. It may not span a |
+ * context boundary. A segment reference is a special character in |
+ * the output string that causes a segment of the input string (not |
+ * the input pattern) to be copied to the output string. The range of |
+ * special characters that represent segment references is defined by |
+ * TransliterationRuleData (RuleBasedTransliterator.Data in ICU4J). |
+ * |
+ * @author Alan Liu |
+ */ |
+class TransliterationRule : public UMemory { |
+ |
+private: |
+ |
+ // TODO Eliminate the pattern and keyLength data members. They |
+ // are used only by masks() and getIndexValue() which are called |
+ // only during build time, not during run-time. Perhaps these |
+ // methods and pattern/keyLength can be isolated into a separate |
+ // object. |
+ |
+ /** |
+ * The match that must occur before the key, or null if there is no |
+ * preceding context. |
+ */ |
+ StringMatcher *anteContext; |
+ |
+ /** |
+ * The matcher object for the key. If null, then the key is empty. |
+ */ |
+ StringMatcher *key; |
+ |
+ /** |
+ * The match that must occur after the key, or null if there is no |
+ * following context. |
+ */ |
+ StringMatcher *postContext; |
+ |
+ /** |
+ * The object that performs the replacement if the key, |
+ * anteContext, and postContext are matched. Never null. |
+ */ |
+ UnicodeFunctor* output; |
+ |
+ /** |
+ * The string that must be matched, consisting of the anteContext, key, |
+ * and postContext, concatenated together, in that order. Some components |
+ * may be empty (zero length). |
+ * @see anteContextLength |
+ * @see keyLength |
+ */ |
+ UnicodeString pattern; |
+ |
+ /** |
+ * An array of matcher objects corresponding to the input pattern |
+ * segments. If there are no segments this is null. N.B. The element |
+ * type is UnicodeFunctor for generality, but in practice each is a |
+ * StringMatcher. In the future we may generalize this, but for |
+ * now we sometimes cast down to StringMatcher. |
+ * |
+ * The array is owned, but the pointers within it are not. |
+ */ |
+ UnicodeFunctor** segments; |
+ |
+ /** |
+ * The number of elements in segments[] or zero if segments is NULL. |
+ */ |
+ int32_t segmentsCount; |
+ |
+ /** |
+ * The length of the string that must match before the key. If |
+ * zero, then there is no matching requirement before the key. |
+ * Substring [0,anteContextLength) of pattern is the anteContext. |
+ */ |
+ int32_t anteContextLength; |
+ |
+ /** |
+ * The length of the key. Substring [anteContextLength, |
+ * anteContextLength + keyLength) of pattern is the key. |
+ * |
+ */ |
+ int32_t keyLength; |
+ |
+ /** |
+ * Miscellaneous attributes (presumably a bit mask of the ANCHOR_* values). |
+ */ |
+ int8_t flags; |
+ |
+ /** |
+ * Flag attributes. The values are distinct bits (1 and 2). |
+ */ |
+ enum { |
+ ANCHOR_START = 1, |
+ ANCHOR_END = 2 |
+ }; |
+ |
+ /** |
+ * An alias pointer to the data for this rule. The data provides |
+ * lookup services for matchers and segments. |
+ */ |
+ const TransliterationRuleData* data; |
+ |
+public: |
+ |
+ /** |
+ * Construct a new rule with the given input, output text, and other |
+ * attributes. A cursor position may be specified for the output text. |
+ * @param input input string, including key and optional ante and |
+ * post context. |
+ * @param anteContextPos offset into input to end of ante context, or -1 if |
+ * none. Must be <= input.length() if not -1. |
+ * @param postContextPos offset into input to start of post context, or -1 |
+ * if none. Must be <= input.length() if not -1, and must be >= |
+ * anteContextPos. |
+ * @param outputStr output string. |
+ * @param cursorPosition offset into outputStr at which cursor is located, |
+ * or -1 if none. If less than zero, then the cursor is placed after the |
+ * output text; that is, -1 is equivalent to <code>outputStr.length()</code>. |
+ * If greater than <code>outputStr.length()</code> this is an error |
+ * (presumably reported through status -- TODO confirm). |
+ * @param cursorOffset an offset to be added to cursorPosition to position the |
+ * cursor either in the ante context, if < 0, or in the post context, if > |
+ * 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to |
+ * "xyz" and moves the cursor to before "a". It would have a cursorOffset |
+ * of -3. |
+ * @param segs array of UnicodeFunctor pointers corresponding to input pattern |
+ * segments, or null if there are none. The array itself is adopted, |
+ * but the pointers within it are not. |
+ * @param segsCount number of elements in segs[]. |
+ * @param anchorStart TRUE if the rule is anchored on the left to |
+ * the context start. |
+ * @param anchorEnd TRUE if the rule is anchored on the right to the |
+ * context limit. |
+ * @param data the rule data. |
+ * @param status Output parameter filled in with success or failure status. |
+ */ |
+ TransliterationRule(const UnicodeString& input, |
+ int32_t anteContextPos, int32_t postContextPos, |
+ const UnicodeString& outputStr, |
+ int32_t cursorPosition, int32_t cursorOffset, |
+ UnicodeFunctor** segs, |
+ int32_t segsCount, |
+ UBool anchorStart, UBool anchorEnd, |
+ const TransliterationRuleData* data, |
+ UErrorCode& status); |
+ |
+ /** |
+ * Copy constructor. Note that other is taken by non-const reference. |
+ * @param other the object to be copied. |
+ */ |
+ TransliterationRule(TransliterationRule& other); |
+ |
+ /** |
+ * Destructor. |
+ */ |
+ virtual ~TransliterationRule(); |
+ |
+ /** |
+ * Change the data object that this rule belongs to. Used |
+ * internally by the TransliterationRuleData copy constructor. |
+ * @param data the new data value to be set. |
+ */ |
+ void setData(const TransliterationRuleData* data); |
+ |
+ /** |
+ * Return the preceding context length. This method is needed to |
+ * support the <code>Transliterator</code> method |
+ * <code>getMaximumContextLength()</code>. Internally, this is |
+ * implemented as the anteContextLength, optionally plus one if |
+ * there is a start anchor. The one character anchor gap is |
+ * needed to make repeated incremental transliteration with |
+ * anchors work. |
+ * @return the preceding context length. |
+ */ |
+ virtual int32_t getContextLength(void) const; |
+ |
+ /** |
+ * Internal method. Returns 8-bit index value for this rule. |
+ * This is the low byte of the first character of the key, |
+ * unless the first character of the key is a set. If it's a |
+ * set, or otherwise can match multiple keys, the index value is -1. |
+ * @return 8-bit index value for this rule. |
+ */ |
+ int16_t getIndexValue() const; |
+ |
+ /** |
+ * Internal method. Returns true if this rule matches the given |
+ * index value. The index value is an 8-bit integer, 0..255, |
+ * representing the low byte of the first character of the key. |
+ * It matches this rule if it matches the first character of the |
+ * key, or if the first character of the key is a set, and the set |
+ * contains any character with a low byte equal to the index |
+ * value. If the rule contains only ante context, as in foo)>bar, |
+ * then it will match any key. |
+ * @param v the given index value. |
+ * @return true if this rule matches the given index value. |
+ */ |
+ UBool matchesIndexValue(uint8_t v) const; |
+ |
+ /** |
+ * Return true if this rule masks another rule. If r1 masks r2 then |
+ * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks |
+ * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". |
+ * "[c]a>x" masks "[dc]a>y". |
+ * @param r2 the given rule to be compared with. |
+ * @return true if this rule masks 'r2' |
+ */ |
+ virtual UBool masks(const TransliterationRule& r2) const; |
+ |
+ /** |
+ * Attempt a match and replacement at the given position. Return |
+ * the degree of match between this rule and the given text. The |
+ * degree of match may be mismatch, a partial match, or a full |
+ * match. A mismatch means at least one character of the text |
+ * does not match the context or key. A partial match means some |
+ * context and key characters match, but the text is not long |
+ * enough to match all of them. A full match means all context |
+ * and key characters match. |
+ * |
+ * If a full match is obtained, perform a replacement, update pos, |
+ * and return U_MATCH. Otherwise both text and pos are unchanged. |
+ * |
+ * @param text the text |
+ * @param pos the position indices |
+ * @param incremental if TRUE, test for partial matches that may |
+ * be completed by additional text inserted at pos.limit. |
+ * @return one of <code>U_MISMATCH</code>, |
+ * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If |
+ * incremental is FALSE then U_PARTIAL_MATCH will not be returned. |
+ */ |
+ UMatchDegree matchAndReplace(Replaceable& text, |
+ UTransPosition& pos, |
+ UBool incremental) const; |
+ |
+ /** |
+ * Create a rule string that represents this rule object. Append |
+ * it to the given string, pat (presumably also the return value; TODO confirm). |
+ */ |
+ virtual UnicodeString& toRule(UnicodeString& pat, |
+ UBool escapeUnprintable) const; |
+ |
+ /** |
+ * Union the set of all characters that may be modified by this rule |
+ * into the given set. |
+ */ |
+ void addSourceSetTo(UnicodeSet& toUnionTo) const; |
+ |
+ /** |
+ * Union the set of all characters that may be emitted by this rule |
+ * into the given set. |
+ */ |
+ void addTargetSetTo(UnicodeSet& toUnionTo) const; |
+ |
+ private: |
+ |
+ friend class StringMatcher; |
+ |
+ TransliterationRule &operator=(const TransliterationRule &other); // declared private to forbid assignment of this class |
+}; |
+ |
+U_NAMESPACE_END |
+ |
+#endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
+ |
+#endif |
Property changes on: icu46/source/i18n/rbt_rule.h |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |