icu46/source/i18n/cpdtrans.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/i18n/cpdtrans.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/i18n/cpdtrans.cpp

===================================================================

--- icu46/source/i18n/cpdtrans.cpp (revision 0)

+++ icu46/source/i18n/cpdtrans.cpp (revision 0)

@@ -0,0 +1,616 @@

+/*

+**********************************************************************

+* Date Name Description

+* 11/17/99 aliu Creation.

+**********************************************************************

+*/

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_TRANSLITERATION

+#include "unicode/unifilt.h"

+#include "unicode/uniset.h"

+#include "cpdtrans.h"

+#include "uvector.h"

+#include "tridpars.h"

+#include "cmemory.h"

+// keep in sync with Transliterator

+//static const UChar ID_SEP = 0x002D; /*-*/

+static const UChar ID_DELIM = 0x003B; /*;*/

+static const UChar NEWLINE = 10;

+// Empty string

+static const UChar EMPTY[] = {0}; //""

+static const UChar COLON_COLON[] = {0x3A, 0x3A, 0}; //"::"

+U_NAMESPACE_BEGIN

+const UChar CompoundTransliterator::PASS_STRING[] = { 0x0025, 0x0050, 0x0061, 0x0073, 0x0073, 0 }; // "%Pass"

+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompoundTransliterator)

+/**

+ * Constructs a new compound transliterator given an array of

+ * transliterators. The array of transliterators may be of any

+ * length, including zero or one, however, useful compound

+ * transliterators have at least two components.

+ * @param transliterators array of <code>Transliterator</code>

+ * objects

+ * @param transliteratorCount The number of

+ * <code>Transliterator</code> objects in transliterators.

+ * @param filter the filter. Any character for which

+ * <tt>filter.contains()</tt> returns <tt>false</tt> will not be

+ * altered by this transliterator. If <tt>filter</tt> is

+ * <tt>null</tt> then no filtering is applied.

+ */

+CompoundTransliterator::CompoundTransliterator(

+ Transliterator* const transliterators[],

+ int32_t transliteratorCount,

+ UnicodeFilter* adoptedFilter) :

+ Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter),

+ trans(0), count(0), numAnonymousRBTs(0) {

+ setTransliterators(transliterators, transliteratorCount);

+/**

+ * Splits an ID of the form "ID;ID;..." into a compound using each

+ * of the IDs.

+ * @param id of above form

+ * @param forward if false, does the list in reverse order, and

+ * takes the inverse of each ID.

+ */

+CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,

+ UTransDirection direction,

+ UnicodeFilter* adoptedFilter,

+ UParseError& /*parseError*/,

+ UErrorCode& status) :

+ Transliterator(id, adoptedFilter),

+ trans(0), numAnonymousRBTs(0) {

+ // TODO add code for parseError...currently unused, but

+ // later may be used by parsing code...

+ init(id, direction, TRUE, status);

+CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,

+ UParseError& /*parseError*/,

+ UErrorCode& status) :

+ Transliterator(id, 0), // set filter to 0 here!

+ trans(0), numAnonymousRBTs(0) {

+ // TODO add code for parseError...currently unused, but

+ // later may be used by parsing code...

+ init(id, UTRANS_FORWARD, TRUE, status);

+/**

+ * Private constructor for use of TransliteratorAlias

+ */

+CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID,

+ UVector& list,

+ UnicodeFilter* adoptedFilter,

+ int32_t anonymousRBTs,

+ UParseError& /*parseError*/,

+ UErrorCode& status) :

+ Transliterator(newID, adoptedFilter),

+ trans(0), numAnonymousRBTs(anonymousRBTs)

+ init(list, UTRANS_FORWARD, FALSE, status);

+/**

+ * Private constructor for Transliterator from a vector of

+ * transliterators. The caller is responsible for fixing up the

+ * ID.

+ */

+CompoundTransliterator::CompoundTransliterator(UVector& list,

+ UParseError& /*parseError*/,

+ UErrorCode& status) :

+ Transliterator(EMPTY, NULL),

+ trans(0), numAnonymousRBTs(0)

+ // TODO add code for parseError...currently unused, but

+ // later may be used by parsing code...

+ init(list, UTRANS_FORWARD, FALSE, status);

+ // assume caller will fixup ID

+CompoundTransliterator::CompoundTransliterator(UVector& list,

+ int32_t anonymousRBTs,

+ UParseError& /*parseError*/,

+ UErrorCode& status) :

+ Transliterator(EMPTY, NULL),

+ trans(0), numAnonymousRBTs(anonymousRBTs)

+ init(list, UTRANS_FORWARD, FALSE, status);

+/**

+ * Finish constructing a transliterator: only to be called by

+ * constructors. Before calling init(), set trans and filter to NULL.

+ * @param id the id containing ';'-separated entries

+ * @param direction either FORWARD or REVERSE

+ * @param idSplitPoint the index into id at which the

+ * adoptedSplitTransliterator should be inserted, if there is one, or

+ * -1 if there is none.

+ * @param adoptedSplitTransliterator a transliterator to be inserted

+ * before the entry at offset idSplitPoint in the id string. May be

+ * NULL to insert no entry.

+ * @param fixReverseID if TRUE, then reconstruct the ID of reverse

+ * entries by calling getID() of component entries. Some constructors

+ * do not require this because they apply a facade ID anyway.

+ * @param status the error code indicating success or failure

+ */

+void CompoundTransliterator::init(const UnicodeString& id,

+ UTransDirection direction,

+ UBool fixReverseID,

+ UErrorCode& status) {

+ // assert(trans == 0);

+ if (U_FAILURE(status)) {

+ return;

+ }

+ UVector list(status);

+ UnicodeSet* compoundFilter = NULL;

+ UnicodeString regenID;

+ if (!TransliteratorIDParser::parseCompoundID(id, direction,

+ regenID, list, compoundFilter)) {

+ status = U_INVALID_ID;

+ delete compoundFilter;

+ return;

+ }

+ TransliteratorIDParser::instantiateList(list, status);

+ init(list, direction, fixReverseID, status);

+ if (compoundFilter != NULL) {

+ adoptFilter(compoundFilter);

+ }

+/**

+ * Finish constructing a transliterator: only to be called by

+ * constructors. Before calling init(), set trans and filter to NULL.

+ * @param list a vector of transliterator objects to be adopted. It

+ * should NOT be empty. The list should be in declared order. That

+ * is, it should be in the FORWARD order; if direction is REVERSE then

+ * the list order will be reversed.

+ * @param direction either FORWARD or REVERSE

+ * @param fixReverseID if TRUE, then reconstruct the ID of reverse

+ * entries by calling getID() of component entries. Some constructors

+ * do not require this because they apply a facade ID anyway.

+ * @param status the error code indicating success or failure

+ */

+void CompoundTransliterator::init(UVector& list,

+ UTransDirection direction,

+ UBool fixReverseID,

+ UErrorCode& status) {

+ // assert(trans == 0);

+ // Allocate array

+ if (U_SUCCESS(status)) {

+ count = list.size();

+ trans = (Transliterator **)uprv_malloc(count * sizeof(Transliterator *));

+ /* test for NULL */

+ if (trans == 0) {

+ status = U_MEMORY_ALLOCATION_ERROR;

+ return;

+ }

+ if (U_FAILURE(status) || trans == 0) {

+ // assert(trans == 0);

+ return;

+ }

+ // Move the transliterators from the vector into an array.

+ // Reverse the order if necessary.

+ int32_t i;

+ for (i=0; i<count; ++i) {

+ int32_t j = (direction == UTRANS_FORWARD) ? i : count - 1 - i;

+ trans[i] = (Transliterator*) list.elementAt(j);

+ }

+ // If the direction is UTRANS_REVERSE then we may need to fix the

+ // ID.

+ if (direction == UTRANS_REVERSE && fixReverseID) {

+ UnicodeString newID;

+ for (i=0; i<count; ++i) {

+ if (i > 0) {

+ newID.append(ID_DELIM);

+ }

+ newID.append(trans[i]->getID());

+ }

+ setID(newID);

+ }

+ computeMaximumContextLength();

+/**

+ * Return the IDs of the given list of transliterators, concatenated

+ * with ID_DELIM delimiting them. Equivalent to the perlish expression

+ * join(ID_DELIM, map($_.getID(), transliterators).

+ */

+UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterators[],

+ int32_t transCount) {

+ UnicodeString id;

+ for (int32_t i=0; i<transCount; ++i) {

+ if (i > 0) {

+ id.append(ID_DELIM);

+ }

+ id.append(transliterators[i]->getID());

+ }

+ return id; // Return temporary

+/**

+ * Copy constructor.

+ */

+CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) :

+ Transliterator(t), trans(0), count(0), numAnonymousRBTs(-1) {

+ *this = t;

+/**

+ * Destructor

+ */

+CompoundTransliterator::~CompoundTransliterator() {

+ freeTransliterators();

+void CompoundTransliterator::freeTransliterators(void) {

+ if (trans != 0) {

+ for (int32_t i=0; i<count; ++i) {

+ delete trans[i];

+ }

+ uprv_free(trans);

+ }

+ trans = 0;

+ count = 0;

+/**

+ * Assignment operator.

+ */

+CompoundTransliterator& CompoundTransliterator::operator=(

+ const CompoundTransliterator& t)

+ Transliterator::operator=(t);

+ int32_t i = 0;

+ UBool failed = FALSE;

+ if (trans != NULL) {

+ for (i=0; i<count; ++i) {

+ delete trans[i];

+ trans[i] = 0;

+ }

+ if (t.count > count) {

+ if (trans != NULL) {

+ uprv_free(trans);

+ }

+ trans = (Transliterator **)uprv_malloc(t.count * sizeof(Transliterator *));

+ }

+ count = t.count;

+ if (trans != NULL) {

+ for (i=0; i<count; ++i) {

+ trans[i] = t.trans[i]->clone();

+ if (trans[i] == NULL) {

+ failed = TRUE;

+ break;

+ }

+ // if memory allocation failed delete backwards trans array

+ if (failed && i > 0) {

+ int32_t n;

+ for (n = i-1; n >= 0; n--) {

+ uprv_free(trans[n]);

+ trans[n] = NULL;

+ }

+ numAnonymousRBTs = t.numAnonymousRBTs;

+ return *this;

+/**

+ * Transliterator API.

+ */

+Transliterator* CompoundTransliterator::clone(void) const {

+ return new CompoundTransliterator(*this);

+/**

+ * Returns the number of transliterators in this chain.

+ * @return number of transliterators in this chain.

+ */

+int32_t CompoundTransliterator::getCount(void) const {

+ return count;

+/**

+ * Returns the transliterator at the given index in this chain.

+ * @param index index into chain, from 0 to <code>getCount() - 1</code>

+ * @return transliterator at the given index

+ */

+const Transliterator& CompoundTransliterator::getTransliterator(int32_t index) const {

+ return *trans[index];

+void CompoundTransliterator::setTransliterators(Transliterator* const transliterators[],

+ int32_t transCount) {

+ Transliterator** a = (Transliterator **)uprv_malloc(transCount * sizeof(Transliterator *));

+ if (a == NULL) {

+ return;

+ }

+ int32_t i = 0;

+ UBool failed = FALSE;

+ for (i=0; i<transCount; ++i) {

+ a[i] = transliterators[i]->clone();

+ if (a[i] == NULL) {

+ failed = TRUE;

+ break;

+ }

+ if (failed && i > 0) {

+ int32_t n;

+ for (n = i-1; n >= 0; n--) {

+ uprv_free(a[n]);

+ a[n] = NULL;

+ }

+ return;

+ }

+ adoptTransliterators(a, transCount);

+void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransliterators[],

+ int32_t transCount) {

+ // First free trans[] and set count to zero. Once this is done,

+ // orphan the filter. Set up the new trans[].

+ freeTransliterators();

+ trans = adoptedTransliterators;

+ count = transCount;

+ computeMaximumContextLength();

+ setID(joinIDs(trans, count));

+/**

+ * Append c to buf, unless buf is empty or buf already ends in c.

+ */

+static void _smartAppend(UnicodeString& buf, UChar c) {

+ if (buf.length() != 0 &&

+ buf.charAt(buf.length() - 1) != c) {

+ buf.append(c);

+ }

+UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource,

+ UBool escapeUnprintable) const {

+ // We do NOT call toRules() on our component transliterators, in

+ // general. If we have several rule-based transliterators, this

+ // yields a concatenation of the rules -- not what we want. We do

+ // handle compound RBT transliterators specially -- those for which

+ // compoundRBTIndex >= 0. For the transliterator at compoundRBTIndex,

+ // we do call toRules() recursively.

+ rulesSource.truncate(0);

+ if (numAnonymousRBTs >= 1 && getFilter() != NULL) {

+ // If we are a compound RBT and if we have a global

+ // filter, then emit it at the top.

+ UnicodeString pat;

+ rulesSource.append(COLON_COLON).append(getFilter()->toPattern(pat, escapeUnprintable)).append(ID_DELIM);

+ }

+ for (int32_t i=0; i<count; ++i) {

+ UnicodeString rule;

+ // Anonymous RuleBasedTransliterators (inline rules and

+ // ::BEGIN/::END blocks) are given IDs that begin with

+ // "%Pass": use toRules() to write all the rules to the output

+ // (and insert "::Null;" if we have two in a row)

+ if (trans[i]->getID().startsWith(PASS_STRING)) {

+ trans[i]->toRules(rule, escapeUnprintable);

+ if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1]->getID().startsWith(PASS_STRING))

+ rule = UNICODE_STRING_SIMPLE("::Null;") + rule;

+ // we also use toRules() on CompoundTransliterators (which we

+ // check for by looking for a semicolon in the ID)-- this gets

+ // the list of their child transliterators output in the right

+ // format

+ } else if (trans[i]->getID().indexOf(ID_DELIM) >= 0) {

+ trans[i]->toRules(rule, escapeUnprintable);

+ // for everything else, use Transliterator::toRules()

+ } else {

+ trans[i]->Transliterator::toRules(rule, escapeUnprintable);

+ }

+ _smartAppend(rulesSource, NEWLINE);

+ rulesSource.append(rule);

+ _smartAppend(rulesSource, ID_DELIM);

+ }

+ return rulesSource;

+/**

+ * Implement Transliterator framework

+ */

+void CompoundTransliterator::handleGetSourceSet(UnicodeSet& result) const {

+ UnicodeSet set;

+ result.clear();

+ for (int32_t i=0; i<count; ++i) {

+ result.addAll(trans[i]->getSourceSet(set));

+ // Take the example of Hiragana-Latin. This is really

+ // Hiragana-Katakana; Katakana-Latin. The source set of

+ // these two is roughly [:Hiragana:] and [:Katakana:].

+ // But the source set for the entire transliterator is

+ // actually [:Hiragana:] ONLY -- that is, the first

+ // non-empty source set.

+ // This is a heuristic, and not 100% reliable.

+ if (!result.isEmpty()) {

+ break;

+ }

+/**

+ * Override Transliterator framework

+ */

+UnicodeSet& CompoundTransliterator::getTargetSet(UnicodeSet& result) const {

+ UnicodeSet set;

+ result.clear();

+ for (int32_t i=0; i<count; ++i) {

+ // This is a heuristic, and not 100% reliable.

+ result.addAll(trans[i]->getTargetSet(set));

+ }

+ return result;

+/**

+ * Implements {@link Transliterator#handleTransliterate}.

+ */

+void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,

+ UBool incremental) const {

+ /* Call each transliterator with the same contextStart and

+ * start, but with the limit as modified

+ * by preceding transliterators. The start index must be

+ * reset for each transliterator to give each a chance to

+ * transliterate the text. The initial contextStart index is known

+ * to still point to the same place after each transliterator

+ * is called because each transliterator will not change the

+ * text between contextStart and the initial start index.

+ *

+ * IMPORTANT: After the first transliterator, each subsequent

+ * transliterator only gets to transliterate text committed by

+ * preceding transliterators; that is, the start (output

+ * value) of transliterator i becomes the limit (input value)

+ * of transliterator i+1. Finally, the overall limit is fixed

+ * up before we return.

+ *

+ * Assumptions we make here:

+ * (1) contextStart <= start <= limit <= contextLimit <= text.length()

+ * (2) start <= start' <= limit' ;cursor doesn't move back

+ * (3) start <= limit' ;text before cursor unchanged

+ * - start' is the value of start after calling handleKT

+ * - limit' is the value of limit after calling handleKT

+ */

+ /**

+ * Example: 3 transliterators. This example illustrates the

+ * mechanics we need to implement. C, S, and L are the contextStart,

+ * start, and limit. gl is the globalLimit. contextLimit is

+ * equal to limit throughout.

+ *

+ * 1. h-u, changes hex to Unicode

+ *

+ * 4 7 a d 0 4 7 a

+ * abc/u0061/u => abca/u

+ * C S L C S L gl=f->a

+ *

+ * 2. upup, changes "x" to "XX"

+ *

+ * 4 7 a 4 7 a

+ * abca/u => abcAA/u

+ * C SL C S

+ * L gl=a->b

+ * 3. u-h, changes Unicode to hex

+ *

+ * 4 7 a 4 7 a d 0 3

+ * abcAA/u => abc/u0041/u0041/u

+ * C S L C S

+ * L gl=b->15

+ * 4. return

+ *

+ * 4 7 a d 0 3

+ * abc/u0041/u0041/u

+ * C S L

+ */

+ if (count < 1) {

+ index.start = index.limit;

+ return; // Short circuit for empty compound transliterators

+ }

+ // compoundLimit is the limit value for the entire compound

+ // operation. We overwrite index.limit with the previous

+ // index.start. After each transliteration, we update

+ // compoundLimit for insertions or deletions that have happened.

+ int32_t compoundLimit = index.limit;

+ // compoundStart is the start for the entire compound

+ // operation.

+ int32_t compoundStart = index.start;

+ int32_t delta = 0; // delta in length

+ // Give each transliterator a crack at the run of characters.

+ // See comments at the top of the method for more detail.

+ for (int32_t i=0; i<count; ++i) {

+ index.start = compoundStart; // Reset start

+ int32_t limit = index.limit;

+ if (index.start == index.limit) {

+ // Short circuit for empty range

+ break;

+ }

+ trans[i]->filteredTransliterate(text, index, incremental);

+ // In a properly written transliterator, start == limit after

+ // handleTransliterate() returns when incremental is false.

+ // Catch cases where the subclass doesn't do this, and throw

+ // an exception. (Just pinning start to limit is a bad idea,

+ // because what's probably happening is that the subclass

+ // isn't transliterating all the way to the end, and it should

+ // in non-incremental mode.)

+ if (!incremental && index.start != index.limit) {

+ // We can't throw an exception, so just fudge things

+ index.start = index.limit;

+ }

+ // Cumulative delta for insertions/deletions

+ delta += index.limit - limit;

+ if (incremental) {

+ // In the incremental case, only allow subsequent

+ // transliterators to modify what has already been

+ // completely processed by prior transliterators. In the

+ // non-incrmental case, allow each transliterator to

+ // process the entire text.

+ index.limit = index.start;

+ }

+ compoundLimit += delta;

+ // Start is good where it is -- where the last transliterator left

+ // it. Limit needs to be put back where it was, modulo

+ // adjustments for deletions/insertions.

+ index.limit = compoundLimit;

+/**

+ * Sets the length of the longest context required by this transliterator.

+ * This is <em>preceding</em> context.

+ */

+void CompoundTransliterator::computeMaximumContextLength(void) {

+ int32_t max = 0;

+ for (int32_t i=0; i<count; ++i) {

+ int32_t len = trans[i]->getMaximumContextLength();

+ if (len > max) {

+ max = len;

+ }

+ setMaximumContextLength(max);

+U_NAMESPACE_END

+#endif /* #if !UCONFIG_NO_TRANSLITERATION */

+/* eof */

Property changes on: icu46/source/i18n/cpdtrans.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/i18n/cpdtrans.h ('k') | icu46/source/i18n/csdetect.h » ('j') | no next file with comments »