Index: icu46/source/i18n/cpdtrans.cpp |
=================================================================== |
--- icu46/source/i18n/cpdtrans.cpp (revision 0) |
+++ icu46/source/i18n/cpdtrans.cpp (revision 0) |
@@ -0,0 +1,616 @@ |
+/* |
+********************************************************************** |
+* Copyright (C) 1999-2008, International Business Machines |
+* Corporation and others. All Rights Reserved. |
+********************************************************************** |
+* Date Name Description |
+* 11/17/99 aliu Creation. |
+********************************************************************** |
+*/ |
+ |
+#include "unicode/utypes.h" |
+ |
+#if !UCONFIG_NO_TRANSLITERATION |
+ |
+#include "unicode/unifilt.h" |
+#include "unicode/uniset.h" |
+#include "cpdtrans.h" |
+#include "uvector.h" |
+#include "tridpars.h" |
+#include "cmemory.h" |
+ |
+// Delimiter characters; keep in sync with Transliterator |
+//static const UChar ID_SEP = 0x002D; /*-*/ |
+static const UChar ID_DELIM = 0x003B; /*;*/ // separates component IDs and emitted rules |
+static const UChar NEWLINE = 10; /* LF */ // placed between rules by toRules() |
+ |
+// NUL-terminated UChar string constants |
+static const UChar EMPTY[] = {0}; //"" |
+static const UChar COLON_COLON[] = {0x3A, 0x3A, 0}; //"::" |
+ |
+U_NAMESPACE_BEGIN |
+ |
+const UChar CompoundTransliterator::PASS_STRING[] = { 0x0025, 0x0050, 0x0061, 0x0073, 0x0073, 0 }; // "%Pass": ID prefix that toRules() uses to spot anonymous rule-based transliterators |
+ |
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompoundTransliterator) |
+ |
+/** |
+ * Constructs a new compound transliterator given an array of |
+ * transliterators.  The array may be of any length, including zero |
+ * or one; however, useful compound transliterators have at least two |
+ * components.  The initial ID is the components' IDs joined by ';'. |
+ * @param transliterators array of <code>Transliterator</code> |
+ * objects; each one is cloned, so the caller keeps its originals |
+ * @param transliteratorCount The number of |
+ * <code>Transliterator</code> objects in transliterators. |
+ * @param adoptedFilter the filter, passed on to the Transliterator |
+ * base class.  Any character for which <tt>contains()</tt> returns |
+ * <tt>false</tt> will not be altered by this transliterator.  If it |
+ * is <tt>null</tt> then no filtering is applied. |
+ */ |
+CompoundTransliterator::CompoundTransliterator( |
+ Transliterator* const transliterators[], |
+ int32_t transliteratorCount, |
+ UnicodeFilter* adoptedFilter) : |
+ Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter), |
+ trans(0), count(0), numAnonymousRBTs(0) { |
+ setTransliterators(transliterators, transliteratorCount); |
+} |
+ |
+/** |
+ * Splits an ID of the form "ID;ID;..." into a compound using each |
+ * of the IDs. |
+ * @param id of above form |
+ * @param direction if UTRANS_REVERSE, does the list in reverse |
+ * order, and takes the inverse of each ID. |
+ */ |
+CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, |
+                              UTransDirection direction, |
+                              UnicodeFilter* adoptedFilter, |
+                              UParseError& /*parseError*/, |
+                              UErrorCode& status) : |
+    Transliterator(id, adoptedFilter), |
+    trans(0), count(0), numAnonymousRBTs(0) { |
+    // count starts at 0 so getCount() is well defined even when init() |
+    // fails before assigning it.  TODO: parseError is currently unused. |
+    init(id, direction, TRUE, status); |
+} |
+ |
+CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, |
+                              UParseError& /*parseError*/, |
+                              UErrorCode& status) : |
+    Transliterator(id, 0), // set filter to 0 here! |
+    trans(0), count(0), numAnonymousRBTs(0) { |
+    // Forward-direction compound from "ID;ID;...".  count starts at 0 so it is |
+    // defined even if init() fails early.  TODO: parseError is currently unused. |
+    init(id, UTRANS_FORWARD, TRUE, status); |
+} |
+ |
+ |
+/** |
+ * Private constructor for use of TransliteratorAlias; adopts the elements of list. |
+ */ |
+CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID, |
+                              UVector& list, |
+                              UnicodeFilter* adoptedFilter, |
+                              int32_t anonymousRBTs, |
+                              UParseError& /*parseError*/, |
+                              UErrorCode& status) : |
+    Transliterator(newID, adoptedFilter), |
+    trans(0), count(0), numAnonymousRBTs(anonymousRBTs) |
+{   // count starts at 0 so getCount() stays defined if init() returns early |
+    init(list, UTRANS_FORWARD, FALSE, status); |
+} |
+ |
+/** |
+ * Private constructor for Transliterator from a vector of |
+ * transliterators, which are adopted.  The ID is left empty; the |
+ * caller is responsible for fixing it up. |
+ */ |
+CompoundTransliterator::CompoundTransliterator(UVector& list, |
+                              UParseError& /*parseError*/, |
+                              UErrorCode& status) : |
+    Transliterator(EMPTY, NULL), |
+    trans(0), count(0), numAnonymousRBTs(0) |
+{ |
+    // count starts at 0 so it is defined even if init() returns early. |
+    // TODO: parseError is currently unused; parsing code may use it later. |
+    init(list, UTRANS_FORWARD, FALSE, status); |
+    // assume caller will fixup ID |
+} |
+ |
+CompoundTransliterator::CompoundTransliterator(UVector& list, |
+                              int32_t anonymousRBTs, |
+                              UParseError& /*parseError*/, |
+                              UErrorCode& status) : |
+    Transliterator(EMPTY, NULL), |
+    trans(0), count(0), numAnonymousRBTs(anonymousRBTs) |
+{   // adopts list's elements; count(0) keeps getCount() defined if init() bails out |
+    init(list, UTRANS_FORWARD, FALSE, status); |
+} |
+ |
+/** |
+ * Second-stage construction from a compound ID string; call only |
+ * from constructors.  Before calling init(), set trans to NULL. |
+ * |
+ * The ID is parsed into its ';'-separated entries, the entries are |
+ * instantiated, and the resulting list is passed to the UVector |
+ * overload of init().  A filter produced by the parser is adopted. |
+ * @param id the id containing ';'-separated entries |
+ * @param direction either FORWARD or REVERSE |
+ * @param fixReverseID if TRUE, then reconstruct the ID of reverse |
+ * entries by calling getID() of component entries.  Some constructors |
+ * do not require this because they apply a facade ID anyway. |
+ * @param status the error code indicating success or failure; set to |
+ * U_INVALID_ID when the ID cannot be parsed.  Nothing is done if it |
+ * already indicates failure on entry. |
+ */ |
+void CompoundTransliterator::init(const UnicodeString& id, |
+                                  UTransDirection direction, |
+                                  UBool fixReverseID, |
+                                  UErrorCode& status) { |
+    // assert(trans == 0); |
+    if (U_FAILURE(status)) { |
+        return; |
+    } |
+ |
+    UVector parsedEntries(status); |
+    UnicodeSet* parsedFilter = NULL; |
+    UnicodeString regeneratedID;   // output of the parser; not used afterwards |
+    const UBool parsedOk = TransliteratorIDParser::parseCompoundID( |
+            id, direction, regeneratedID, parsedEntries, parsedFilter); |
+    if (!parsedOk) { |
+        status = U_INVALID_ID; |
+        delete parsedFilter; |
+        return; |
+    } |
+ |
+    // Instantiate the parsed entries, then hand the list to the UVector overload. |
+    TransliteratorIDParser::instantiateList(parsedEntries, status); |
+    init(parsedEntries, direction, fixReverseID, status); |
+ |
+    if (parsedFilter != NULL) { |
+        adoptFilter(parsedFilter); |
+    } |
+} |
+ |
+/** |
+ * Finish constructing a transliterator: only to be called by |
+ * constructors.  Before calling init(), set trans and filter to NULL. |
+ * @param list a vector of transliterator objects to be adopted.  It |
+ * should NOT be empty.  The list should be in declared order.  That |
+ * is, it should be in the FORWARD order; if direction is REVERSE then |
+ * the list order will be reversed. |
+ * @param direction either FORWARD or REVERSE |
+ * @param fixReverseID if TRUE, then reconstruct the ID of reverse |
+ * entries by calling getID() of component entries.  Some constructors |
+ * do not require this because they apply a facade ID anyway. |
+ * @param status the error code indicating success or failure.  On |
+ * failure (on entry or on allocation) count is left at 0. |
+ */ |
+void CompoundTransliterator::init(UVector& list, |
+                                  UTransDirection direction, |
+                                  UBool fixReverseID, |
+                                  UErrorCode& status) { |
+    // assert(trans == 0); |
+ |
+    // Keep count consistent with trans on every exit path: the loops in |
+    // this class index trans[0..count-1]. |
+    count = 0; |
+    if (U_FAILURE(status)) { |
+        return; |
+    } |
+ |
+    // Allocate array |
+    const int32_t listSize = list.size(); |
+    trans = (Transliterator **)uprv_malloc(listSize * sizeof(Transliterator *)); |
+    if (trans == 0) { |
+        status = U_MEMORY_ALLOCATION_ERROR; |
+        return;   // count stays 0, so nobody dereferences the NULL array |
+    } |
+    count = listSize; |
+ |
+    // Move the transliterators from the vector into an array. |
+    // Reverse the order if necessary. |
+    int32_t i; |
+    for (i=0; i<count; ++i) { |
+        int32_t j = (direction == UTRANS_FORWARD) ? i : count - 1 - i; |
+        trans[i] = (Transliterator*) list.elementAt(j); |
+    } |
+ |
+    // If the direction is UTRANS_REVERSE then we may need to fix the ID. |
+    if (direction == UTRANS_REVERSE && fixReverseID) { |
+        UnicodeString newID; |
+        for (i=0; i<count; ++i) { |
+            if (i > 0) { |
+                newID.append(ID_DELIM); |
+            } |
+            newID.append(trans[i]->getID()); |
+        } |
+        setID(newID); |
+    } |
+ |
+    computeMaximumContextLength(); |
+} |
+ |
+/** |
+ * Builds a compound ID: the getID() of each of the transCount entries in |
+ * transliterators, in array order, with ID_DELIM between neighbours. |
+ * An empty array yields an empty string. |
+ */ |
+UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterators[], |
+                                              int32_t transCount) { |
+    UnicodeString joined; |
+    for (int32_t idx=0; idx<transCount; ) { |
+        joined.append(transliterators[idx]->getID()); |
+        if (++idx < transCount) { |
+            joined.append(ID_DELIM);   // delimiter only between entries |
+        } |
+    } |
+    return joined; // Return temporary |
+} |
+ |
+/** |
+ * Copy constructor.  Starts from an empty chain and delegates the copy to operator=. |
+ */ |
+CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) : |
+ Transliterator(t), trans(0), count(0), numAnonymousRBTs(-1) { |
+ *this = t; |
+} |
+ |
+/** |
+ * Destructor.  Releases the component transliterators and their array via freeTransliterators(). |
+ */ |
+CompoundTransliterator::~CompoundTransliterator() { |
+ freeTransliterators(); |
+} |
+ |
+void CompoundTransliterator::freeTransliterators(void) { |
+    if (trans != NULL) { |
+        int32_t idx = 0; |
+        while (idx < count) { delete trans[idx++]; } |
+        uprv_free(trans); |
+    } |
+    // Leave the object as a valid, empty compound. |
+    count = 0; |
+    trans = NULL; |
+} |
+ |
+/** |
+ * Assignment operator.  Replaces this object's chain with clones of |
+ * t's components.  Self-assignment is a no-op.  If the array or any |
+ * clone cannot be allocated, this object is left with an empty chain. |
+ * @param t the compound transliterator to copy from |
+ * @return a reference to this object |
+ */ |
+CompoundTransliterator& CompoundTransliterator::operator=( |
+                                             const CompoundTransliterator& t) |
+{ |
+    if (this == &t) { |
+        // Without this guard the code below would delete our own |
+        // components and then try to clone them. |
+        return *this; |
+    } |
+    Transliterator::operator=(t); |
+    numAnonymousRBTs = t.numAnonymousRBTs; |
+ |
+    // Release the old chain first.  This also resets trans and count, |
+    // so the object stays consistent if anything below fails. |
+    freeTransliterators(); |
+    if (t.trans == NULL || t.count <= 0) { |
+        return *this; |
+    } |
+ |
+    trans = (Transliterator **)uprv_malloc(t.count * sizeof(Transliterator *)); |
+    if (trans == NULL) { |
+        return *this;   // out of memory: remain an empty compound |
+    } |
+ |
+    // count always equals the number of valid entries in trans. |
+    for (count = 0; count < t.count; ++count) { |
+        trans[count] = t.trans[count]->clone(); |
+        if (trans[count] == NULL) { |
+            // The clones are objects, so they must be destroyed with |
+            // delete (not uprv_free); freeTransliterators() does that |
+            // for the clones made so far and frees the array. |
+            freeTransliterators(); |
+            break; |
+        } |
+    } |
+    return *this; |
+} |
+ |
+/** |
+ * Transliterator API.  Returns a new object made with the copy constructor. |
+ */ |
+Transliterator* CompoundTransliterator::clone(void) const { |
+ return new CompoundTransliterator(*this); |
+} |
+ |
+/** |
+ * Returns the number of transliterators in this chain. |
+ * @return the value of count, i.e. the number of entries in trans |
+ */ |
+int32_t CompoundTransliterator::getCount(void) const { |
+ return count; |
+} |
+ |
+/** |
+ * Returns the transliterator at the given index in this chain. |
+ * @param index index into chain, from 0 to <code>getCount() - 1</code> (not range-checked here) |
+ * @return transliterator at the given index |
+ */ |
+const Transliterator& CompoundTransliterator::getTransliterator(int32_t index) const { |
+ return *trans[index]; |
+} |
+ |
+void CompoundTransliterator::setTransliterators(Transliterator* const transliterators[], |
+                                                int32_t transCount) { |
+    // Clone every entry into a new array and adopt it; on failure change nothing. |
+    Transliterator** a = (Transliterator **)uprv_malloc(transCount * sizeof(Transliterator *)); |
+    if (a == NULL) { |
+        return; |
+    } |
+    int32_t i; |
+    for (i=0; i<transCount; ++i) { |
+        a[i] = transliterators[i]->clone(); |
+        if (a[i] == NULL) { |
+            break; |
+        } |
+    } |
+    if (i < transCount) { |
+        // A clone failed, possibly the very first one.  Destroy the clones made |
+        // so far with delete (they are objects) and release the array as well. |
+        while (--i >= 0) { |
+            delete a[i]; |
+        } |
+        uprv_free(a); |
+        return; |
+    } |
+    adoptTransliterators(a, transCount); |
+} |
+ |
+void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransliterators[], |
+ int32_t transCount) { |
+ // First free the old trans[] (which also zeroes count), then take over |
+ // the new array and refresh the maximum context length and the ID. |
+ freeTransliterators(); |
+ trans = adoptedTransliterators; |
+ count = transCount; |
+ computeMaximumContextLength(); |
+ setID(joinIDs(trans, count)); |
+} |
+ |
+// Append c to buf, unless buf is empty or buf already ends in c. |
+// (Used by toRules() to avoid doubled or leading separators.) |
+static void _smartAppend(UnicodeString& buf, UChar c) { |
+    const int32_t len = buf.length(); |
+    if (len == 0 || buf.charAt(len - 1) == c) { |
+        return; |
+    } |
+    buf.append(c); |
+} |
+ |
+UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource, |
+ UBool escapeUnprintable) const { |
+ // We do NOT call toRules() on our component transliterators, in |
+ // general. If we have several rule-based transliterators, this |
+ // yields a concatenation of the rules -- not what we want. The |
+ // exceptions, handled in the loop below, are anonymous RBTs (IDs |
+ // starting with "%Pass") and nested compounds (IDs containing ';'), |
+ // for which we do call toRules() recursively. |
+ rulesSource.truncate(0); |
+ if (numAnonymousRBTs >= 1 && getFilter() != NULL) { |
+ // If we are a compound RBT and if we have a global |
+ // filter, then emit it at the top. |
+ UnicodeString pat; |
+ rulesSource.append(COLON_COLON).append(getFilter()->toPattern(pat, escapeUnprintable)).append(ID_DELIM); |
+ } |
+ for (int32_t i=0; i<count; ++i) { |
+ UnicodeString rule; |
+ |
+ // Anonymous RuleBasedTransliterators (inline rules and |
+ // ::BEGIN/::END blocks) are given IDs that begin with |
+ // "%Pass": use toRules() to write all the rules to the output |
+ // (and insert "::Null;" if we have two in a row) |
+ if (trans[i]->getID().startsWith(PASS_STRING)) { |
+ trans[i]->toRules(rule, escapeUnprintable); |
+ if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1]->getID().startsWith(PASS_STRING)) |
+ rule = UNICODE_STRING_SIMPLE("::Null;") + rule; |
+ |
+ // we also use toRules() on CompoundTransliterators (which we |
+ // check for by looking for a semicolon in the ID)-- this gets |
+ // the list of their child transliterators output in the right |
+ // format |
+ } else if (trans[i]->getID().indexOf(ID_DELIM) >= 0) { |
+ trans[i]->toRules(rule, escapeUnprintable); |
+ |
+ // for everything else, use Transliterator::toRules() |
+ } else { |
+ trans[i]->Transliterator::toRules(rule, escapeUnprintable); |
+ } |
+ _smartAppend(rulesSource, NEWLINE); |
+ rulesSource.append(rule); |
+ _smartAppend(rulesSource, ID_DELIM); |
+ } |
+ return rulesSource; |
+} |
+ |
+/** |
+ * Implement Transliterator framework: computes the source set of the chain. |
+ */ |
+void CompoundTransliterator::handleGetSourceSet(UnicodeSet& result) const { |
+    // Take the example of Hiragana-Latin.  This is really |
+    // Hiragana-Katakana; Katakana-Latin.  The source set of |
+    // these two is roughly [:Hiragana:] and [:Katakana:]. |
+    // But the source set for the entire transliterator is |
+    // actually [:Hiragana:] ONLY -- that is, the first |
+    // non-empty source set. |
+    // |
+    // This is a heuristic, and not 100% reliable. |
+    UnicodeSet componentSet; |
+    result.clear(); |
+    int32_t idx = 0; |
+    while (idx < count && result.isEmpty()) { |
+        // Stop as soon as one component contributes something. |
+        result.addAll(trans[idx]->getSourceSet(componentSet)); |
+        ++idx; |
+    } |
+} |
+ |
+/** |
+ * Override Transliterator framework: union of all components' target sets (a heuristic, not 100% reliable). |
+ */ |
+UnicodeSet& CompoundTransliterator::getTargetSet(UnicodeSet& result) const { |
+    result.clear(); |
+    UnicodeSet componentSet;   // scratch set handed to each component |
+    int32_t idx = 0; |
+    while (idx < count) { |
+        result.addAll(trans[idx++]->getTargetSet(componentSet)); |
+    } |
+    return result; |
+} |
+ |
+/** |
+ * Implements {@link Transliterator#handleTransliterate}: runs each component over the text in turn. |
+ */ |
+void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, |
+ UBool incremental) const { |
+ /* Call each transliterator with the same contextStart and |
+ * start, but with the limit as modified |
+ * by preceding transliterators. The start index must be |
+ * reset for each transliterator to give each a chance to |
+ * transliterate the text. The initial contextStart index is known |
+ * to still point to the same place after each transliterator |
+ * is called because each transliterator will not change the |
+ * text between contextStart and the initial start index. |
+ * |
+ * IMPORTANT: After the first transliterator, each subsequent |
+ * transliterator only gets to transliterate text committed by |
+ * preceding transliterators; that is, the start (output |
+ * value) of transliterator i becomes the limit (input value) |
+ * of transliterator i+1. Finally, the overall limit is fixed |
+ * up before we return. |
+ * |
+ * Assumptions we make here: |
+ * (1) contextStart <= start <= limit <= contextLimit <= text.length() |
+ * (2) start <= start' <= limit'  ;cursor doesn't move back |
+ * (3) start <= limit'            ;text before cursor unchanged |
+ * - start' is the value of start after calling handleKT |
+ * - limit' is the value of limit after calling handleKT |
+ */ |
+ |
+ /** |
+ * Example: 3 transliterators. This example illustrates the |
+ * mechanics we need to implement. C, S, and L are the contextStart, |
+ * start, and limit. gl is the globalLimit. contextLimit is |
+ * equal to limit throughout. Column numbers are in hex. |
+ * |
+ * 1. h-u, changes hex to Unicode |
+ * |
+ *    4  7  a  d  0      4  7  a |
+ *    abc/u0061/u    =>  abca/u |
+ *    C  S       L       C   S L   gl=f->a |
+ * |
+ * 2. upup, changes "x" to "XX" |
+ * |
+ *    4  7  a       4  7  a |
+ *    abca/u    =>  abcAA/u |
+ *    C  SL         C    S |
+ *                       L    gl=a->b |
+ * 3. u-h, changes Unicode to hex |
+ * |
+ *    4  7  a        4  7  a  d  0  3 |
+ *    abcAA/u    =>  abc/u0041/u0041/u |
+ *    C  S L         C              S |
+ *                                  L   gl=b->15 |
+ * 4. return |
+ * |
+ *    4  7  a  d  0  3 |
+ *    abc/u0041/u0041/u |
+ *    C              S L |
+ */ |
+ |
+ if (count < 1) { |
+ index.start = index.limit; |
+ return; // Short circuit for empty compound transliterators |
+ } |
+ |
+ // compoundLimit is the limit value for the entire compound |
+ // operation. We overwrite index.limit with the previous |
+ // index.start. After each transliteration, we update |
+ // compoundLimit for insertions or deletions that have happened. |
+ int32_t compoundLimit = index.limit; |
+ |
+ // compoundStart is the start for the entire compound |
+ // operation. |
+ int32_t compoundStart = index.start; |
+ |
+ int32_t delta = 0; // delta in length |
+ |
+ // Give each transliterator a crack at the run of characters. |
+ // See comments at the top of the method for more detail. |
+ for (int32_t i=0; i<count; ++i) { |
+ index.start = compoundStart; // Reset start |
+ int32_t limit = index.limit; |
+ |
+ if (index.start == index.limit) { |
+ // Short circuit for empty range |
+ break; |
+ } |
+ |
+ trans[i]->filteredTransliterate(text, index, incremental); |
+ |
+ // In a properly written transliterator, start == limit after |
+ // handleTransliterate() returns when incremental is false. |
+ // Catch cases where the subclass doesn't do this, and throw |
+ // an exception. (Just pinning start to limit is a bad idea, |
+ // because what's probably happening is that the subclass |
+ // isn't transliterating all the way to the end, and it should |
+ // in non-incremental mode.) |
+ if (!incremental && index.start != index.limit) { |
+ // We can't throw an exception, so just fudge things |
+ index.start = index.limit; |
+ } |
+ |
+ // Cumulative delta for insertions/deletions |
+ delta += index.limit - limit; |
+ |
+ if (incremental) { |
+ // In the incremental case, only allow subsequent |
+ // transliterators to modify what has already been |
+ // completely processed by prior transliterators. In the |
+ // non-incremental case, allow each transliterator to |
+ // process the entire text. |
+ index.limit = index.start; |
+ } |
+ } |
+ |
+ compoundLimit += delta; |
+ |
+ // Start is good where it is -- where the last transliterator left |
+ // it. Limit needs to be put back where it was, modulo |
+ // adjustments for deletions/insertions. |
+ index.limit = compoundLimit; |
+} |
+ |
+/** |
+ * Recomputes this transliterator's maximum <em>preceding</em> context |
+ * length: the largest value reported by any component, and at least 0. |
+ */ |
+void CompoundTransliterator::computeMaximumContextLength(void) { |
+    int32_t longest = 0; |
+    int32_t idx = 0; |
+    while (idx < count) { |
+        const int32_t candidate = trans[idx++]->getMaximumContextLength(); |
+        longest = (candidate > longest) ? candidate : longest; |
+    } |
+    // Publish the result through the setter. |
+    setMaximumContextLength(longest); |
+} |
+ |
+U_NAMESPACE_END |
+ |
+#endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
+ |
+/* eof */ |
Property changes on: icu46/source/i18n/cpdtrans.cpp |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |