Index: icu46/source/i18n/strrepl.cpp |
=================================================================== |
--- icu46/source/i18n/strrepl.cpp (revision 0) |
+++ icu46/source/i18n/strrepl.cpp (revision 0) |
@@ -0,0 +1,328 @@ |
+/* |
+********************************************************************** |
+* Copyright (c) 2002-2004, International Business Machines Corporation |
+* and others. All Rights Reserved. |
+********************************************************************** |
+* Date Name Description |
+* 01/21/2002 aliu Creation. |
+********************************************************************** |
+*/ |
+ |
+#include "unicode/utypes.h" |
+ |
+#if !UCONFIG_NO_TRANSLITERATION |
+ |
+#include "strrepl.h" |
+#include "rbt_data.h" |
+#include "util.h" |
+#include "unicode/uniset.h" |
+ |
+U_NAMESPACE_BEGIN |
+ |
+static const UChar EMPTY[] = { 0 }; // empty string "": replace() passes it to handleReplaceBetween() to delete a range |
+ |
+UnicodeReplacer::~UnicodeReplacer() {} /* out-of-line definition of the base class destructor; the body is empty */ |
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer) |
+ |
+/** |
+ * Build a replacer that emits theOutput and also reports a cursor |
+ * position (hasCursor is set to TRUE). |
+ * @param theOutput replacement text, copied into 'output'; a character |
+ * for which theData has a replacer stands for that nested replacer |
+ * rather than for itself. |
+ * @param theCursorPos value stored in cursorPos and used by replace() |
+ * to compute the cursor it reports |
+ * @param theData context object used to look up the nested replacers; |
+ * the pointer is stored as-is, the object is not copied |
+ */ |
+StringReplacer::StringReplacer(const UnicodeString& theOutput, |
+                               int32_t theCursorPos, |
+                               const TransliterationRuleData* theData) : |
+    output(theOutput) { |
+    data = theData; |
+    hasCursor = TRUE; |
+    cursorPos = theCursorPos; |
+    isComplex = TRUE; // assume nested replacers until replace() finds none |
+} |
+ |
+/** |
+ * Build a replacer that emits theOutput and does not itself set the |
+ * cursor (hasCursor is set to FALSE, cursorPos to 0). |
+ * @param theOutput replacement text, copied into 'output'; a character |
+ * for which theData has a replacer stands for that nested replacer |
+ * rather than for itself. |
+ * @param theData context object used to look up the nested replacers; |
+ * the pointer is stored as-is, the object is not copied |
+ */ |
+StringReplacer::StringReplacer(const UnicodeString& theOutput, |
+                               const TransliterationRuleData* theData) : |
+    output(theOutput) { |
+    data = theData; |
+    hasCursor = FALSE; |
+    cursorPos = 0; |
+    isComplex = TRUE; // assume nested replacers until replace() finds none |
+} |
+ |
+/** |
+ * Copy constructor: member-wise copy; 'data' is copied as a pointer, so both objects refer to the same rule data. |
+ */ |
+StringReplacer::StringReplacer(const StringReplacer& other) : |
+    UnicodeFunctor(other), |
+    UnicodeReplacer(other), |
+    output(other.output) |
+{ |
+    data = other.data; |
+    hasCursor = other.hasCursor; |
+    cursorPos = other.cursorPos; |
+    isComplex = other.isComplex; |
+} |
+ |
+/** |
+ * Destructor. The body is empty: nothing is released explicitly here. |
+ */ |
+StringReplacer::~StringReplacer() { |
+} |
+ |
+/** |
+ * Implement UnicodeFunctor: return a new heap-allocated copy of this object, made with the copy constructor. |
+ */ |
+UnicodeFunctor* StringReplacer::clone() const { |
+ return new StringReplacer(*this); |
+} |
+ |
+/** |
+ * Implement UnicodeFunctor: return this object viewed through its UnicodeReplacer base (the method is const, the returned pointer is not). |
+ */ |
+UnicodeReplacer* StringReplacer::toReplacer() const { |
+    return const_cast<StringReplacer*>(this); // only const is cast away; the conversion to the base class is implicit |
+} |
+ |
+/** |
+ * UnicodeReplacer API: replace text[start, limit) with the output, set cursor if hasCursor, and return the length of the new text. |
+ */ |
+int32_t StringReplacer::replace(Replaceable& text, |
+ int32_t start, |
+ int32_t limit, |
+ int32_t& cursor) { |
+ int32_t outLen; /* length of the replacement text; this is the return value */ |
+ int32_t newStart = 0; /* cursor offset relative to start; made absolute in the hasCursor block below */ |
+ |
+ // NOTE: It should be possible to _always_ run the complex |
+ // processing code; just slower. If not, then there is a bug |
+ // in the complex processing code. |
+ |
+ // Simple (no nested replacers) Processing Code : isComplex is FALSE only after a complex pass found no nested replacer |
+ if (!isComplex) { |
+ text.handleReplaceBetween(start, limit, output); |
+ outLen = output.length(); |
+ |
+ // Setup default cursor position (for cursorPos within output) |
+ newStart = cursorPos; |
+ } |
+ |
+ // Complex (nested replacers) Processing Code : |
+ else { |
+ /* When there are segments to be copied, use the Replaceable.copy() |
+ * API in order to retain out-of-band data. Copy everything to the |
+ * end of the string, then copy them back over the key. This preserves |
+ * the integrity of indices into the key and surrounding context while |
+ * generating the output text. |
+ */ |
+ UnicodeString buf; |
+ int32_t oOutput; // offset into 'output' |
+ isComplex = FALSE; /* set back to TRUE below as soon as a nested replacer is found */ |
+ |
+ // The temporary buffer starts at tempStart, and extends |
+ // to destLimit. The start of the buffer has a single |
+ // character from before the key. This provides style |
+ // data when additional characters are filled into the |
+ // temporary buffer. If there is nothing to the left, use |
+ // the non-character U+FFFF, which Replaceable subclasses |
+ // should treat specially as a "no-style character." |
+ // destStart points to the point after the style context |
+ // character, so it is tempStart+1 or tempStart+2. |
+ int32_t tempStart = text.length(); // start of temp buffer |
+ int32_t destStart = tempStart; // copy new text to here |
+ if (start > 0) { |
+ int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1)); |
+ text.copy(start-len, start, tempStart); |
+ destStart += len; |
+ } else { |
+ UnicodeString str((UChar) 0xFFFF); |
+ text.handleReplaceBetween(tempStart, tempStart, str); |
+ destStart++; |
+ } |
+ int32_t destLimit = destStart; |
+ |
+ for (oOutput=0; oOutput<output.length(); ) { |
+ if (oOutput == cursorPos) { |
+ // Record the position of the cursor |
+ newStart = destLimit - destStart; // relative to start |
+ } |
+ UChar32 c = output.char32At(oOutput); |
+ UnicodeReplacer* r = data->lookupReplacer(c); |
+ if (r == NULL) { |
+ // Accumulate straight (non-segment) text. |
+ buf.append(c); |
+ } else { |
+ isComplex = TRUE; |
+ |
+ // Insert any accumulated straight text. |
+ if (buf.length() > 0) { |
+ text.handleReplaceBetween(destLimit, destLimit, buf); |
+ destLimit += buf.length(); |
+ buf.truncate(0); |
+ } |
+ |
+ // Delegate output generation to replacer object |
+ int32_t len = r->replace(text, destLimit, destLimit, cursor); |
+ destLimit += len; |
+ } |
+ oOutput += UTF_CHAR_LENGTH(c); |
+ } |
+ // Insert any accumulated straight text. |
+ if (buf.length() > 0) { |
+ text.handleReplaceBetween(destLimit, destLimit, buf); |
+ destLimit += buf.length(); |
+ } |
+ if (oOutput == cursorPos) { |
+ // Record the position of the cursor |
+ newStart = destLimit - destStart; // relative to start |
+ } |
+ |
+ outLen = destLimit - destStart; |
+ |
+ // Copy new text to start, and delete it (the temp buffer, now shifted right by outLen, is what gets deleted) |
+ text.copy(destStart, destLimit, start); |
+ text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY); |
+ |
+ // Delete the old text (the key) |
+ text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY); |
+ } |
+ |
+ if (hasCursor) { |
+ // Adjust the cursor for positions outside the key. These |
+ // refer to code points rather than code units. If cursorPos |
+ // is within the output string, then use newStart, which has |
+ // already been set above. |
+ if (cursorPos < 0) { |
+ newStart = start; |
+ int32_t n = cursorPos; |
+ // Outside the output string, cursorPos counts code points |
+ while (n < 0 && newStart > 0) { |
+ newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1)); |
+ ++n; |
+ } |
+ newStart += n; /* n is still negative here if the start of the text was reached first */ |
+ } else if (cursorPos > output.length()) { |
+ newStart = start + outLen; |
+ int32_t n = cursorPos - output.length(); |
+ // Outside the output string, cursorPos counts code points |
+ while (n > 0 && newStart < text.length()) { |
+ newStart += UTF_CHAR_LENGTH(text.char32At(newStart)); |
+ --n; |
+ } |
+ newStart += n; /* n is still positive here if the end of the text was reached first */ |
+ } else { |
+ // Cursor is within output string. It has been set up above |
+ // to be relative to start. |
+ newStart += start; |
+ } |
+ |
+ cursor = newStart; |
+ } |
+ |
+ return outLen; |
+} |
+ |
+/** |
+ * UnicodeReplacer API: overwrite 'rule' with a rule-text rendering of the output and cursor, and return 'rule'. |
+ */ |
+UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, |
+ UBool escapeUnprintable) const { |
+ rule.truncate(0); |
+ UnicodeString quoteBuf; |
+ |
+ int32_t cursor = cursorPos; |
+ |
+ // Handle a cursor preceding the output: one '@' per position before the output |
+ if (hasCursor && cursor < 0) { |
+ while (cursor++ < 0) { |
+ ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); |
+ } |
+ // Fall through and append '|' below. NOTE(review): the loop above exits with cursor == 1, so '|' is appended at i == 1, not before the output -- confirm this is intended |
+ } |
+ |
+ for (int32_t i=0; i<output.length(); ++i) { |
+ if (hasCursor && i == cursor) { |
+ ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); |
+ } |
+ UChar c = output.charAt(i); // Ok to use 16-bits here |
+ |
+ UnicodeReplacer* r = data->lookupReplacer(c); |
+ if (r == NULL) { |
+ ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); |
+ } else { |
+ UnicodeString buf; |
+ r->toReplacerPattern(buf, escapeUnprintable); |
+ buf.insert(0, (UChar)0x20); /* the nested replacer's pattern is wrapped in one space on each side */ |
+ buf.append((UChar)0x20); |
+ ICU_Utility::appendToRule(rule, buf, |
+ TRUE, escapeUnprintable, quoteBuf); |
+ } |
+ } |
+ |
+ // Handle a cursor after the output. Use > rather than >= because |
+ // if cursor == output.length() it is at the end of the output, |
+ // which is the default position, so we need not emit it. |
+ if (hasCursor && cursor > output.length()) { |
+ cursor -= output.length(); |
+ while (cursor-- > 0) { |
+ ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); |
+ } |
+ ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); |
+ } |
+ // Flush quoteBuf out to result |
+ ICU_Utility::appendToRule(rule, -1, |
+ TRUE, escapeUnprintable, quoteBuf); |
+ |
+ return rule; |
+} |
+ |
+/** |
+ * Implement UnicodeReplacer: add to toUnionTo every code point of the output that has no replacer in 'data'; each nested replacer adds its own set. |
+ */ |
+void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { |
+    for (int32_t pos = 0; pos < output.length(); ) { |
+        const UChar32 cp = output.char32At(pos); |
+        pos += UTF_CHAR_LENGTH(cp); // step by one whole code point |
+        UnicodeReplacer* nested = data->lookupReplacer(cp); |
+        if (nested != NULL) { |
+            nested->addReplacementSetTo(toUnionTo); |
+        } else { |
+            toUnionTo.add(cp); |
+        } |
+    } |
+} |
+ |
+/** |
+ * UnicodeFunctor API: store d in 'data', then pass it on to every functor that 'data' maps an output code point to. |
+ */ |
+void StringReplacer::setData(const TransliterationRuleData* d) { |
+    data = d; |
+    UChar32 cp; |
+    for (int32_t pos = 0; pos < output.length(); pos += UTF_CHAR_LENGTH(cp)) { |
+        cp = output.char32At(pos); |
+        UnicodeFunctor* nested = data->lookup(cp); |
+        if (nested == NULL) { |
+            continue; // ordinary character, nothing to update |
+        } |
+        nested->setData(data); |
+    } |
+} |
+ |
+U_NAMESPACE_END |
+ |
+#endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
+ |
+//eof |
Property changes on: icu46/source/i18n/strrepl.cpp |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |