icu46/source/i18n/rbt_rule.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/i18n/rbt_rule.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/i18n/rbt_rule.cpp

===================================================================

--- icu46/source/i18n/rbt_rule.cpp (revision 0)

+++ icu46/source/i18n/rbt_rule.cpp (revision 0)

@@ -0,0 +1,556 @@

+/*

+ **********************************************************************

+ * Date Name Description

+ * 11/17/99 aliu Creation.

+ **********************************************************************

+ */

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_TRANSLITERATION

+#include "unicode/rep.h"

+#include "unicode/unifilt.h"

+#include "unicode/uniset.h"

+#include "rbt_rule.h"

+#include "rbt_data.h"

+#include "cmemory.h"

+#include "strmatch.h"

+#include "strrepl.h"

+#include "util.h"

+#include "putilimp.h"

+static const UChar FORWARD_OP[] = {32,62,32,0}; // " > "

+U_NAMESPACE_BEGIN

+/**

+ * Construct a new rule with the given input, output text, and other

+ * attributes. A cursor position may be specified for the output text.

+ * @param input input string, including key and optional ante and

+ * post context

+ * @param anteContextPos offset into input to end of ante context, or -1 if

+ * none. Must be <= input.length() if not -1.

+ * @param postContextPos offset into input to start of post context, or -1

+ * if none. Must be <= input.length() if not -1, and must be >=

+ * anteContextPos.

+ * @param outputStr output string

+ * @param cursorPosition offset into output at which cursor is located, or -1 if

+ * none. If less than zero, then the cursor is placed after the

+ * <code>output</code>; that is, -1 is equivalent to

+ * <code>output.length()</code>. If greater than

+ * <code>output.length()</code> then <code>status</code> is set to U_ILLEGAL_ARGUMENT_ERROR.

+ * @param segs array of UnicodeFunctors corresponding to input pattern

+ * segments, or null if there are none. The array itself is adopted,

+ * but the pointers within it are not.

+ * @param segsCount number of elements in segs[]

+ * @param anchorStart TRUE if the rule is anchored on the left to

+ * the context start

+ * @param anchorEnd TRUE if the rule is anchored on the right to the

+ * context limit

+ */

+TransliterationRule::TransliterationRule(const UnicodeString& input,

+ int32_t anteContextPos, int32_t postContextPos,

+ const UnicodeString& outputStr,

+ int32_t cursorPosition, int32_t cursorOffset,

+ UnicodeFunctor** segs,

+ int32_t segsCount,

+ UBool anchorStart, UBool anchorEnd,

+ const TransliterationRuleData* theData,

+ UErrorCode& status) :

+ UMemory(),

+ segments(0),

+ data(theData) {

+ if (U_FAILURE(status)) {

+ return;

+ }

+ // Do range checks only when warranted to save time

+ if (anteContextPos < 0) {

+ anteContextLength = 0;

+ } else {

+ if (anteContextPos > input.length()) {

+ // throw new IllegalArgumentException("Invalid ante context");

+ status = U_ILLEGAL_ARGUMENT_ERROR;

+ return;

+ }

+ anteContextLength = anteContextPos;

+ }

+ if (postContextPos < 0) {

+ keyLength = input.length() - anteContextLength;

+ } else {

+ if (postContextPos < anteContextLength ||

+ postContextPos > input.length()) {

+ // throw new IllegalArgumentException("Invalid post context");

+ status = U_ILLEGAL_ARGUMENT_ERROR;

+ return;

+ }

+ keyLength = postContextPos - anteContextLength;

+ }

+ if (cursorPosition < 0) {

+ cursorPosition = outputStr.length();

+ } else if (cursorPosition > outputStr.length()) {

+ // throw new IllegalArgumentException("Invalid cursor position");

+ status = U_ILLEGAL_ARGUMENT_ERROR;

+ return;

+ }

+ // We don't validate the segments array. The caller must

+ // guarantee that the segments are well-formed (that is, that

+ // all $n references in the output refer to indices of this

+ // array, and that no array elements are null).

+ this->segments = segs;

+ this->segmentsCount = segsCount;

+ pattern = input;

+ flags = 0;

+ if (anchorStart) {

+ flags |= ANCHOR_START;

+ }

+ if (anchorEnd) {

+ flags |= ANCHOR_END;

+ }

+ anteContext = NULL;

+ if (anteContextLength > 0) {

+ anteContext = new StringMatcher(pattern, 0, anteContextLength,

+ FALSE, *data);

+ /* test for NULL */

+ if (anteContext == 0) {

+ status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ key = NULL;

+ if (keyLength > 0) {

+ key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength,

+ FALSE, *data);

+ /* test for NULL */

+ if (key == 0) {

+ status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ int32_t postContextLength = pattern.length() - keyLength - anteContextLength;

+ postContext = NULL;

+ if (postContextLength > 0) {

+ postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(),

+ FALSE, *data);

+ /* test for NULL */

+ if (postContext == 0) {

+ status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data);

+ /* test for NULL */

+ if (this->output == 0) {

+ status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+/**

+ * Copy constructor.

+ */

+TransliterationRule::TransliterationRule(TransliterationRule& other) :

+ UMemory(other),

+ anteContext(NULL),

+ key(NULL),

+ postContext(NULL),

+ pattern(other.pattern),

+ anteContextLength(other.anteContextLength),

+ keyLength(other.keyLength),

+ flags(other.flags),

+ data(other.data) {

+ segments = NULL;

+ segmentsCount = 0;

+ if (other.segmentsCount > 0) {

+ segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *));

+ uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0]));

+ }

+ if (other.anteContext != NULL) {

+ anteContext = (StringMatcher*) other.anteContext->clone();

+ }

+ if (other.key != NULL) {

+ key = (StringMatcher*) other.key->clone();

+ }

+ if (other.postContext != NULL) {

+ postContext = (StringMatcher*) other.postContext->clone();

+ }

+ output = other.output->clone();

+TransliterationRule::~TransliterationRule() {

+ uprv_free(segments);

+ delete anteContext;

+ delete key;

+ delete postContext;

+ delete output;

+/**

+ * Return the preceding context length. This method is needed to

+ * support the <code>Transliterator</code> method

+ * <code>getMaximumContextLength()</code>. Internally, this is

+ * implemented as the anteContextLength, optionally plus one if

+ * there is a start anchor. The one character anchor gap is

+ * needed to make repeated incremental transliteration with

+ * anchors work.

+ */

+int32_t TransliterationRule::getContextLength(void) const {

+ return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0);

+/**

+ * Internal method. Returns 8-bit index value for this rule.

+ * This is the low byte of the first character of the key,

+ * unless the first character of the key is a set. If it's a

+ * set, or otherwise can match multiple keys, the index value is -1.

+ */

+int16_t TransliterationRule::getIndexValue() const {

+ if (anteContextLength == pattern.length()) {

+ // A pattern with just ante context {such as foo)>bar} can

+ // match any key.

+ return -1;

+ }

+ UChar32 c = pattern.char32At(anteContextLength);

+ return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1);

+/**

+ * Internal method. Returns true if this rule matches the given

+ * index value. The index value is an 8-bit integer, 0..255,

+ * representing the low byte of the first character of the key.

+ * It matches this rule if it matches the first character of the

+ * key, or if the first character of the key is a set, and the set

+ * contains any character with a low byte equal to the index

+ * value. If the rule contains only ante context, as in foo)>bar,

+ * then it will match any key.

+ */

+UBool TransliterationRule::matchesIndexValue(uint8_t v) const {

+ // Delegate to the key, or if there is none, to the postContext.

+ // If there is neither then we match any key; return true.

+ UnicodeMatcher *m = (key != NULL) ? key : postContext;

+ return (m != NULL) ? m->matchesIndexValue(v) : TRUE;

+/**

+ * Return true if this rule masks another rule. If r1 masks r2 then

+ * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks

+ * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".

+ * "[c]a>x" masks "[dc]a>y".

+ */

+UBool TransliterationRule::masks(const TransliterationRule& r2) const {

+ /* Rule r1 masks rule r2 if the string formed of the

+ * antecontext, key, and postcontext overlaps in the following

+ * way:

+ *

+ * r1: aakkkpppp

+ * r2: aaakkkkkpppp

+ * ^

+ *

+ * The strings must be aligned at the first character of the

+ * key. The length of r1 to the left of the alignment point

+ * must be <= the length of r2 to the left; ditto for the

+ * right. The characters of r1 must equal (or be a superset

+ * of) the corresponding characters of r2. The superset

+ * operation should be performed to check for UnicodeSet

+ * masking.

+ *

+ * Anchors: Two patterns that differ only in anchors only

+ * mask one another if they are exactly equal, and r2 has

+ * all the anchors r1 has (optionally, plus some). Here Y

+ * means the row masks the column, N means it doesn't.

+ *

+ * ab ^ab ab$ ^ab$

+ * ab Y Y Y Y

+ * ^ab N Y N Y

+ * ab$ N N Y Y

+ * ^ab$ N N N Y

+ *

+ * Post context: {a}b masks ab, but not vice versa, since {a}b

+ * matches everything ab matches, and {a}b matches {|a|}b but ab

+ * does not. Pre context is different (a{b} does not align with

+ * ab).

+ */

+ /* LIMITATION of the current mask algorithm: Some rule

+ * maskings are currently not detected. For example,

+ * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO

+ */

+ int32_t len = pattern.length();

+ int32_t left = anteContextLength;

+ int32_t left2 = r2.anteContextLength;

+ int32_t right = len - left;

+ int32_t right2 = r2.pattern.length() - left2;

+ int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern);

+ // TODO Clean this up -- some logic might be combinable with the

+ // next statement.

+ // Test for anchor masking

+ if (left == left2 && right == right2 &&

+ keyLength <= r2.keyLength &&

+ 0 == cachedCompare) {

+ // The following boolean logic implements the table above

+ return (flags == r2.flags) ||

+ (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) ||

+ ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END));

+ }

+ return left <= left2 &&

+ (right < right2 ||

+ (right == right2 && keyLength <= r2.keyLength)) &&

+ (0 == cachedCompare);

+static inline int32_t posBefore(const Replaceable& str, int32_t pos) {

+ return (pos > 0) ?

+ pos - UTF_CHAR_LENGTH(str.char32At(pos-1)) :

+ pos - 1;

+static inline int32_t posAfter(const Replaceable& str, int32_t pos) {

+ return (pos >= 0 && pos < str.length()) ?

+ pos + UTF_CHAR_LENGTH(str.char32At(pos)) :

+ pos + 1;

+/**

+ * Attempt a match and replacement at the given position. Return

+ * the degree of match between this rule and the given text. The

+ * degree of match may be mismatch, a partial match, or a full

+ * match. A mismatch means at least one character of the text

+ * does not match the context or key. A partial match means some

+ * context and key characters match, but the text is not long

+ * enough to match all of them. A full match means all context

+ * and key characters match.

+ *

+ * If a full match is obtained, perform a replacement, update pos,

+ * and return U_MATCH. Otherwise both text and pos are unchanged.

+ *

+ * @param text the text

+ * @param pos the position indices

+ * @param incremental if TRUE, test for partial matches that may

+ * be completed by additional text inserted at pos.limit.

+ * @return one of <code>U_MISMATCH</code>,

+ * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If

+ * incremental is FALSE then U_PARTIAL_MATCH will not be returned.

+ */

+UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,

+ UTransPosition& pos,

+ UBool incremental) const {

+ // Matching and replacing are done in one method because the

+ // replacement operation needs information obtained during the

+ // match. Another way to do this is to have the match method

+ // create a match result struct with relevant offsets, and to pass

+ // this into the replace method.

+ // ============================ MATCH ===========================

+ // Reset segment match data

+ if (segments != NULL) {

+ for (int32_t i=0; i<segmentsCount; ++i) {

+ ((StringMatcher*) segments[i])->resetMatch();

+ }

+// int32_t lenDelta, keyLimit;

+ int32_t keyLimit;

+ // ------------------------ Ante Context ------------------------

+ // A mismatch in the ante context, or with the start anchor,

+ // is an outright U_MISMATCH regardless of whether we are

+ // incremental or not.

+ int32_t oText; // offset into 'text'

+// int32_t newStart = 0;

+ int32_t minOText;

+ // Note (1): We process text in 16-bit code units, rather than

+ // 32-bit code points. This works because stand-ins are

+ // always in the BMP and because we are doing a literal match

+ // operation, which can be done 16-bits at a time.

+ int32_t anteLimit = posBefore(text, pos.contextStart);

+ UMatchDegree match;

+ // Start reverse match at char before pos.start

+ oText = posBefore(text, pos.start);

+ if (anteContext != NULL) {

+ match = anteContext->matches(text, oText, anteLimit, FALSE);

+ if (match != U_MATCH) {

+ return U_MISMATCH;

+ }

+ minOText = posAfter(text, oText);

+ // ------------------------ Start Anchor ------------------------

+ if (((flags & ANCHOR_START) != 0) && oText != anteLimit) {

+ return U_MISMATCH;

+ }

+ // -------------------- Key and Post Context --------------------

+ oText = pos.start;

+ if (key != NULL) {

+ match = key->matches(text, oText, pos.limit, incremental);

+ if (match != U_MATCH) {

+ return match;

+ }

+ keyLimit = oText;

+ if (postContext != NULL) {

+ if (incremental && keyLimit == pos.limit) {

+ // The key matches just before pos.limit, and there is

+ // a postContext. Since we are in incremental mode,

+ // we must assume more characters may be inserted at

+ // pos.limit -- this is a partial match.

+ return U_PARTIAL_MATCH;

+ }

+ match = postContext->matches(text, oText, pos.contextLimit, incremental);

+ if (match != U_MATCH) {

+ return match;

+ }

+ // ------------------------- Stop Anchor ------------------------

+ if (((flags & ANCHOR_END)) != 0) {

+ if (oText != pos.contextLimit) {

+ return U_MISMATCH;

+ }

+ if (incremental) {

+ return U_PARTIAL_MATCH;

+ }

+ // =========================== REPLACE ==========================

+ // We have a full match. The key is between pos.start and

+ // keyLimit.

+ int32_t newStart;

+ int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart);

+ int32_t lenDelta = newLength - (keyLimit - pos.start);

+ oText += lenDelta;

+ pos.limit += lenDelta;

+ pos.contextLimit += lenDelta;

+ // Restrict new value of start to [minOText, min(oText, pos.limit)].

+ pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));

+ return U_MATCH;

+/**

+ * Create a source string that represents this rule. Append it to the

+ * given string.

+ */

+UnicodeString& TransliterationRule::toRule(UnicodeString& rule,

+ UBool escapeUnprintable) const {

+ // Accumulate special characters (and non-specials following them)

+ // into quoteBuf. Append quoteBuf, within single quotes, when

+ // a non-quoted element must be inserted.

+ UnicodeString str, quoteBuf;

+ // Do not emit the braces '{' '}' around the pattern if there

+ // is neither anteContext nor postContext.

+ UBool emitBraces =

+ (anteContext != NULL) || (postContext != NULL);

+ // Emit start anchor

+ if ((flags & ANCHOR_START) != 0) {

+ rule.append((UChar)94/*^*/);

+ }

+ // Emit the input pattern

+ ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf);

+ if (emitBraces) {

+ ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);

+ }

+ ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf);

+ if (emitBraces) {

+ ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);

+ }

+ ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf);

+ // Emit end anchor

+ if ((flags & ANCHOR_END) != 0) {

+ rule.append((UChar)36/*$*/);

+ }

+ ICU_Utility::appendToRule(rule, FORWARD_OP, TRUE, escapeUnprintable, quoteBuf);

+ // Emit the output pattern

+ ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable),

+ TRUE, escapeUnprintable, quoteBuf);

+ ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);

+ return rule;

+void TransliterationRule::setData(const TransliterationRuleData* d) {

+ data = d;

+ if (anteContext != NULL) anteContext->setData(d);

+ if (postContext != NULL) postContext->setData(d);

+ if (key != NULL) key->setData(d);

+ // assert(output != NULL);

+ output->setData(d);

+ // Don't have to do segments since they are in the context or key

+/**

+ * Union the set of all characters that may be modified by this rule

+ * into the given set.

+ */

+void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {

+ int32_t limit = anteContextLength + keyLength;

+ for (int32_t i=anteContextLength; i<limit; ) {

+ UChar32 ch = pattern.char32At(i);

+ i += UTF_CHAR_LENGTH(ch);

+ const UnicodeMatcher* matcher = data->lookupMatcher(ch);

+ if (matcher == NULL) {

+ toUnionTo.add(ch);

+ } else {

+ matcher->addMatchSetTo(toUnionTo);

+ }

+/**

+ * Union the set of all characters that may be emitted by this rule

+ * into the given set.

+ */

+void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const {

+ output->toReplacer()->addReplacementSetTo(toUnionTo);

+U_NAMESPACE_END

+#endif /* #if !UCONFIG_NO_TRANSLITERATION */

+//eof

Property changes on: icu46/source/i18n/rbt_rule.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/i18n/rbt_rule.h ('k') | icu46/source/i18n/rbt_set.h » ('j') | no next file with comments »