icu46/source/i18n/cpdtrans.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/cpdtrans.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 1999-2008, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 * Date Name Description

	7 * 11/17/99 aliu Creation.

	8 **********************************************************************

	9 */

	10

	11 #include "unicode/utypes.h"

	12

	13 #if !UCONFIG_NO_TRANSLITERATION

	14

	15 #include "unicode/unifilt.h"

	16 #include "unicode/uniset.h"

	17 #include "cpdtrans.h"

	18 #include "uvector.h"

	19 #include "tridpars.h"

	20 #include "cmemory.h"

	21

	22 // keep in sync with Transliterator

	23 //static const UChar ID_SEP = 0x002D; /-/

	24 static const UChar ID_DELIM = 0x003B; /;/

	25 static const UChar NEWLINE = 10;

	26

	27 // Empty string

	28 static const UChar EMPTY[] = {0}; //""

	29 static const UChar COLON_COLON[] = {0x3A, 0x3A, 0}; //"::"

	30

	31 U_NAMESPACE_BEGIN

	32

	33 const UChar CompoundTransliterator::PASS_STRING[] = { 0x0025, 0x0050, 0x0061, 0x 0073, 0x0073, 0 }; // "%Pass"

	34

	35 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompoundTransliterator)

	36

	37 /**

	38 * Constructs a new compound transliterator given an array of

	39 * transliterators. The array of transliterators may be of any

	40 * length, including zero or one, however, useful compound

	41 * transliterators have at least two components.

	42 * @param transliterators array of <code>Transliterator</code>

	43 * objects

	44 * @param transliteratorCount The number of

	45 * <code>Transliterator</code> objects in transliterators.

	46 * @param filter the filter. Any character for which

	47 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be

	48 * altered by this transliterator. If <tt>filter</tt> is

	49 * <tt>null</tt> then no filtering is applied.

	50 */

	51 CompoundTransliterator::CompoundTransliterator(

	52 Transliterator* const transliterators[],

	53 int32_t transliteratorCount,

	54 UnicodeFilter* adoptedFilter) :

	55 Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter) ,

	56 trans(0), count(0), numAnonymousRBTs(0) {

	57 setTransliterators(transliterators, transliteratorCount);

	58 }

	59

	60 /**

	61 * Splits an ID of the form "ID;ID;..." into a compound using each

	62 * of the IDs.

	63 * @param id of above form

	64 * @param forward if false, does the list in reverse order, and

	65 * takes the inverse of each ID.

	66 */

	67 CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,

	68 UTransDirection direction,

	69 UnicodeFilter* adoptedFilter,

	70 UParseError& /parseError/,

	71 UErrorCode& status) :

	72 Transliterator(id, adoptedFilter),

	73 trans(0), numAnonymousRBTs(0) {

	74 // TODO add code for parseError...currently unused, but

	75 // later may be used by parsing code...

	76 init(id, direction, TRUE, status);

	77 }

	78

	79 CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,

	80 UParseError& /parseError/,

	81 UErrorCode& status) :

	82 Transliterator(id, 0), // set filter to 0 here!

	83 trans(0), numAnonymousRBTs(0) {

	84 // TODO add code for parseError...currently unused, but

	85 // later may be used by parsing code...

	86 init(id, UTRANS_FORWARD, TRUE, status);

	87 }

	88

	89

	90 /**

	91 * Private constructor for use of TransliteratorAlias

	92 */

	93 CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID,

	94 UVector& list,

	95 UnicodeFilter* adoptedFilter,

	96 int32_t anonymousRBTs,

	97 UParseError& /parseError/,

	98 UErrorCode& status) :

	99 Transliterator(newID, adoptedFilter),

	100 trans(0), numAnonymousRBTs(anonymousRBTs)

	101 {

	102 init(list, UTRANS_FORWARD, FALSE, status);

	103 }

	104

	105 /**

	106 * Private constructor for Transliterator from a vector of

	107 * transliterators. The caller is responsible for fixing up the

	108 * ID.

	109 */

	110 CompoundTransliterator::CompoundTransliterator(UVector& list,

	111 UParseError& /parseError/,

	112 UErrorCode& status) :

	113 Transliterator(EMPTY, NULL),

	114 trans(0), numAnonymousRBTs(0)

	115 {

	116 // TODO add code for parseError...currently unused, but

	117 // later may be used by parsing code...

	118 init(list, UTRANS_FORWARD, FALSE, status);

	119 // assume caller will fixup ID

	120 }

	121

	122 CompoundTransliterator::CompoundTransliterator(UVector& list,

	123 int32_t anonymousRBTs,

	124 UParseError& /parseError/,

	125 UErrorCode& status) :

	126 Transliterator(EMPTY, NULL),

	127 trans(0), numAnonymousRBTs(anonymousRBTs)

	128 {

	129 init(list, UTRANS_FORWARD, FALSE, status);

	130 }

	131

	132 /**

	133 * Finish constructing a transliterator: only to be called by

	134 * constructors. Before calling init(), set trans and filter to NULL.

	135 * @param id the id containing ';'-separated entries

	136 * @param direction either FORWARD or REVERSE

	137 * @param idSplitPoint the index into id at which the

	138 * adoptedSplitTransliterator should be inserted, if there is one, or

	139 * -1 if there is none.

	140 * @param adoptedSplitTransliterator a transliterator to be inserted

	141 * before the entry at offset idSplitPoint in the id string. May be

	142 * NULL to insert no entry.

	143 * @param fixReverseID if TRUE, then reconstruct the ID of reverse

	144 * entries by calling getID() of component entries. Some constructors

	145 * do not require this because they apply a facade ID anyway.

	146 * @param status the error code indicating success or failure

	147 */

	148 void CompoundTransliterator::init(const UnicodeString& id,

	149 UTransDirection direction,

	150 UBool fixReverseID,

	151 UErrorCode& status) {

	152 // assert(trans == 0);

	153

	154 if (U_FAILURE(status)) {

	155 return;

	156 }

	157

	158 UVector list(status);

	159 UnicodeSet* compoundFilter = NULL;

	160 UnicodeString regenID;

	161 if (!TransliteratorIDParser::parseCompoundID(id, direction,

	162 regenID, list, compoundFilter)) {

	163 status = U_INVALID_ID;

	164 delete compoundFilter;

	165 return;

	166 }

	167

	168 TransliteratorIDParser::instantiateList(list, status);

	169

	170 init(list, direction, fixReverseID, status);

	171

	172 if (compoundFilter != NULL) {

	173 adoptFilter(compoundFilter);

	174 }

	175 }

	176

	177 /**

	178 * Finish constructing a transliterator: only to be called by

	179 * constructors. Before calling init(), set trans and filter to NULL.

	180 * @param list a vector of transliterator objects to be adopted. It

	181 * should NOT be empty. The list should be in declared order. That

	182 * is, it should be in the FORWARD order; if direction is REVERSE then

	183 * the list order will be reversed.

	184 * @param direction either FORWARD or REVERSE

	185 * @param fixReverseID if TRUE, then reconstruct the ID of reverse

	186 * entries by calling getID() of component entries. Some constructors

	187 * do not require this because they apply a facade ID anyway.

	188 * @param status the error code indicating success or failure

	189 */

	190 void CompoundTransliterator::init(UVector& list,

	191 UTransDirection direction,

	192 UBool fixReverseID,

	193 UErrorCode& status) {

	194 // assert(trans == 0);

	195

	196 // Allocate array

	197 if (U_SUCCESS(status)) {

	198 count = list.size();

	199 trans = (Transliterator *)uprv_malloc(count sizeof(Transliterator *)) ;

	200 /* test for NULL */

	201 if (trans == 0) {

	202 status = U_MEMORY_ALLOCATION_ERROR;

	203 return;

	204 }

	205 }

	206

	207 if (U_FAILURE(status) \|\| trans == 0) {

	208 // assert(trans == 0);

	209 return;

	210 }

	211

	212 // Move the transliterators from the vector into an array.

	213 // Reverse the order if necessary.

	214 int32_t i;

	215 for (i=0; i<count; ++i) {

	216 int32_t j = (direction == UTRANS_FORWARD) ? i : count - 1 - i;

	217 trans[i] = (Transliterator*) list.elementAt(j);

	218 }

	219

	220 // If the direction is UTRANS_REVERSE then we may need to fix the

	221 // ID.

	222 if (direction == UTRANS_REVERSE && fixReverseID) {

	223 UnicodeString newID;

	224 for (i=0; i<count; ++i) {

	225 if (i > 0) {

	226 newID.append(ID_DELIM);

	227 }

	228 newID.append(trans[i]->getID());

	229 }

	230 setID(newID);

	231 }

	232

	233 computeMaximumContextLength();

	234 }

	235

	236 /**

	237 * Return the IDs of the given list of transliterators, concatenated

	238 * with ID_DELIM delimiting them. Equivalent to the perlish expression

	239 * join(ID_DELIM, map($_.getID(), transliterators).

	240 */

	241 UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterat ors[],

	242 int32_t transCount) {

	243 UnicodeString id;

	244 for (int32_t i=0; i<transCount; ++i) {

	245 if (i > 0) {

	246 id.append(ID_DELIM);

	247 }

	248 id.append(transliterators[i]->getID());

	249 }

	250 return id; // Return temporary

	251 }

	252

	253 /**

	254 * Copy constructor.

	255 */

	256 CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) :

	257 Transliterator(t), trans(0), count(0), numAnonymousRBTs(-1) {

	258 *this = t;

	259 }

	260

	261 /**

	262 * Destructor

	263 */

	264 CompoundTransliterator::~CompoundTransliterator() {

	265 freeTransliterators();

	266 }

	267

	268 void CompoundTransliterator::freeTransliterators(void) {

	269 if (trans != 0) {

	270 for (int32_t i=0; i<count; ++i) {

	271 delete trans[i];

	272 }

	273 uprv_free(trans);

	274 }

	275 trans = 0;

	276 count = 0;

	277 }

	278

	279 /**

	280 * Assignment operator.

	281 */

	282 CompoundTransliterator& CompoundTransliterator::operator=(

	283 const CompoundTransliterator& t)

	284 {

	285 Transliterator::operator=(t);

	286 int32_t i = 0;

	287 UBool failed = FALSE;

	288 if (trans != NULL) {

	289 for (i=0; i<count; ++i) {

	290 delete trans[i];

	291 trans[i] = 0;

	292 }

	293 }

	294 if (t.count > count) {

	295 if (trans != NULL) {

	296 uprv_free(trans);

	297 }

	298 trans = (Transliterator *)uprv_malloc(t.count sizeof(Transliterator * ));

	299 }

	300 count = t.count;

	301 if (trans != NULL) {

	302 for (i=0; i<count; ++i) {

	303 trans[i] = t.trans[i]->clone();

	304 if (trans[i] == NULL) {

	305 failed = TRUE;

	306 break;

	307 }

	308 }

	309 }

	310

	311 // if memory allocation failed delete backwards trans array

	312 if (failed && i > 0) {

	313 int32_t n;

	314 for (n = i-1; n >= 0; n--) {

	315 uprv_free(trans[n]);

	316 trans[n] = NULL;

	317 }

	318 }

	319 numAnonymousRBTs = t.numAnonymousRBTs;

	320 return *this;

	321 }

	322

	323 /**

	324 * Transliterator API.

	325 */

	326 Transliterator* CompoundTransliterator::clone(void) const {

	327 return new CompoundTransliterator(*this);

	328 }

	329

	330 /**

	331 * Returns the number of transliterators in this chain.

	332 * @return number of transliterators in this chain.

	333 */

	334 int32_t CompoundTransliterator::getCount(void) const {

	335 return count;

	336 }

	337

	338 /**

	339 * Returns the transliterator at the given index in this chain.

	340 * @param index index into chain, from 0 to <code>getCount() - 1</code>

	341 * @return transliterator at the given index

	342 */

	343 const Transliterator& CompoundTransliterator::getTransliterator(int32_t index) c onst {

	344 return *trans[index];

	345 }

	346

	347 void CompoundTransliterator::setTransliterators(Transliterator* const transliter ators[],

	348 int32_t transCount) {

	349 Transliterator a = (Transliterator )uprv_malloc(transCount * sizeof(Tran sliterator *));

	350 if (a == NULL) {

	351 return;

	352 }

	353 int32_t i = 0;

	354 UBool failed = FALSE;

	355 for (i=0; i<transCount; ++i) {

	356 a[i] = transliterators[i]->clone();

	357 if (a[i] == NULL) {

	358 failed = TRUE;

	359 break;

	360 }

	361 }

	362 if (failed && i > 0) {

	363 int32_t n;

	364 for (n = i-1; n >= 0; n--) {

	365 uprv_free(a[n]);

	366 a[n] = NULL;

	367 }

	368 return;

	369 }

	370 adoptTransliterators(a, transCount);

	371 }

	372

	373 void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransli terators[],

	374 int32_t transCount) {

	375 // First free trans[] and set count to zero. Once this is done,

	376 // orphan the filter. Set up the new trans[].

	377 freeTransliterators();

	378 trans = adoptedTransliterators;

	379 count = transCount;

	380 computeMaximumContextLength();

	381 setID(joinIDs(trans, count));

	382 }

	383

	384 /**

	385 * Append c to buf, unless buf is empty or buf already ends in c.

	386 */

	387 static void _smartAppend(UnicodeString& buf, UChar c) {

	388 if (buf.length() != 0 &&

	389 buf.charAt(buf.length() - 1) != c) {

	390 buf.append(c);

	391 }

	392 }

	393

	394 UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource,

	395 UBool escapeUnprintable) const {

	396 // We do NOT call toRules() on our component transliterators, in

	397 // general. If we have several rule-based transliterators, this

	398 // yields a concatenation of the rules -- not what we want. We do

	399 // handle compound RBT transliterators specially -- those for which

	400 // compoundRBTIndex >= 0. For the transliterator at compoundRBTIndex,

	401 // we do call toRules() recursively.

	402 rulesSource.truncate(0);

	403 if (numAnonymousRBTs >= 1 && getFilter() != NULL) {

	404 // If we are a compound RBT and if we have a global

	405 // filter, then emit it at the top.

	406 UnicodeString pat;

	407 rulesSource.append(COLON_COLON).append(getFilter()->toPattern(pat, escap eUnprintable)).append(ID_DELIM);

	408 }

	409 for (int32_t i=0; i<count; ++i) {

	410 UnicodeString rule;

	411

	412 // Anonymous RuleBasedTransliterators (inline rules and

	413 // ::BEGIN/::END blocks) are given IDs that begin with

	414 // "%Pass": use toRules() to write all the rules to the output

	415 // (and insert "::Null;" if we have two in a row)

	416 if (trans[i]->getID().startsWith(PASS_STRING)) {

	417 trans[i]->toRules(rule, escapeUnprintable);

	418 if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1]->getID().startsWit h(PASS_STRING))

	419 rule = UNICODE_STRING_SIMPLE("::Null;") + rule;

	420

	421 // we also use toRules() on CompoundTransliterators (which we

	422 // check for by looking for a semicolon in the ID)-- this gets

	423 // the list of their child transliterators output in the right

	424 // format

	425 } else if (trans[i]->getID().indexOf(ID_DELIM) >= 0) {

	426 trans[i]->toRules(rule, escapeUnprintable);

	427

	428 // for everything else, use Transliterator::toRules()

	429 } else {

	430 trans[i]->Transliterator::toRules(rule, escapeUnprintable);

	431 }

	432 _smartAppend(rulesSource, NEWLINE);

	433 rulesSource.append(rule);

	434 _smartAppend(rulesSource, ID_DELIM);

	435 }

	436 return rulesSource;

	437 }

	438

	439 /**

	440 * Implement Transliterator framework

	441 */

	442 void CompoundTransliterator::handleGetSourceSet(UnicodeSet& result) const {

	443 UnicodeSet set;

	444 result.clear();

	445 for (int32_t i=0; i<count; ++i) {

	446 result.addAll(trans[i]->getSourceSet(set));

	447 // Take the example of Hiragana-Latin. This is really

	448 // Hiragana-Katakana; Katakana-Latin. The source set of

	449 // these two is roughly [:Hiragana:] and [:Katakana:].

	450 // But the source set for the entire transliterator is

	451 // actually [:Hiragana:] ONLY -- that is, the first

	452 // non-empty source set.

	453

	454 // This is a heuristic, and not 100% reliable.

	455 if (!result.isEmpty()) {

	456 break;

	457 }

	458 }

	459 }

	460

	461 /**

	462 * Override Transliterator framework

	463 */

	464 UnicodeSet& CompoundTransliterator::getTargetSet(UnicodeSet& result) const {

	465 UnicodeSet set;

	466 result.clear();

	467 for (int32_t i=0; i<count; ++i) {

	468 // This is a heuristic, and not 100% reliable.

	469 result.addAll(trans[i]->getTargetSet(set));

	470 }

	471 return result;

	472 }

	473

	474 /**

	475 * Implements {@link Transliterator#handleTransliterate}.

	476 */

	477 void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPositi on& index,

	478 UBool incremental) const {

	479 /* Call each transliterator with the same contextStart and

	480 * start, but with the limit as modified

	481 * by preceding transliterators. The start index must be

	482 * reset for each transliterator to give each a chance to

	483 * transliterate the text. The initial contextStart index is known

	484 * to still point to the same place after each transliterator

	485 * is called because each transliterator will not change the

	486 * text between contextStart and the initial start index.

	487 *

	488 * IMPORTANT: After the first transliterator, each subsequent

	489 * transliterator only gets to transliterate text committed by

	490 * preceding transliterators; that is, the start (output

	491 * value) of transliterator i becomes the limit (input value)

	492 * of transliterator i+1. Finally, the overall limit is fixed

	493 * up before we return.

	494 *

	495 * Assumptions we make here:

	496 * (1) contextStart <= start <= limit <= contextLimit <= text.length()

	497 * (2) start <= start' <= limit' ;cursor doesn't move back

	498 * (3) start <= limit' ;text before cursor unchanged

	499 * - start' is the value of start after calling handleKT

	500 * - limit' is the value of limit after calling handleKT

	501 */

	502

	503 /**

	504 * Example: 3 transliterators. This example illustrates the

	505 * mechanics we need to implement. C, S, and L are the contextStart,

	506 * start, and limit. gl is the globalLimit. contextLimit is

	507 * equal to limit throughout.

	508 *

	509 * 1. h-u, changes hex to Unicode

	510 *

	511 * 4 7 a d 0 4 7 a

	512 * abc/u0061/u => abca/u

	513 * C S L C S L gl=f->a

	514 *

	515 * 2. upup, changes "x" to "XX"

	516 *

	517 * 4 7 a 4 7 a

	518 * abca/u => abcAA/u

	519 * C SL C S

	520 * L gl=a->b

	521 * 3. u-h, changes Unicode to hex

	522 *

	523 * 4 7 a 4 7 a d 0 3

	524 * abcAA/u => abc/u0041/u0041/u

	525 * C S L C S

	526 * L gl=b->15

	527 * 4. return

	528 *

	529 * 4 7 a d 0 3

	530 * abc/u0041/u0041/u

	531 * C S L

	532 */

	533

	534 if (count < 1) {

	535 index.start = index.limit;

	536 return; // Short circuit for empty compound transliterators

	537 }

	538

	539 // compoundLimit is the limit value for the entire compound

	540 // operation. We overwrite index.limit with the previous

	541 // index.start. After each transliteration, we update

	542 // compoundLimit for insertions or deletions that have happened.

	543 int32_t compoundLimit = index.limit;

	544

	545 // compoundStart is the start for the entire compound

	546 // operation.

	547 int32_t compoundStart = index.start;

	548

	549 int32_t delta = 0; // delta in length

	550

	551 // Give each transliterator a crack at the run of characters.

	552 // See comments at the top of the method for more detail.

	553 for (int32_t i=0; i<count; ++i) {

	554 index.start = compoundStart; // Reset start

	555 int32_t limit = index.limit;

	556

	557 if (index.start == index.limit) {

	558 // Short circuit for empty range

	559 break;

	560 }

	561

	562 trans[i]->filteredTransliterate(text, index, incremental);

	563

	564 // In a properly written transliterator, start == limit after

	565 // handleTransliterate() returns when incremental is false.

	566 // Catch cases where the subclass doesn't do this, and throw

	567 // an exception. (Just pinning start to limit is a bad idea,

	568 // because what's probably happening is that the subclass

	569 // isn't transliterating all the way to the end, and it should

	570 // in non-incremental mode.)

	571 if (!incremental && index.start != index.limit) {

	572 // We can't throw an exception, so just fudge things

	573 index.start = index.limit;

	574 }

	575

	576 // Cumulative delta for insertions/deletions

	577 delta += index.limit - limit;

	578

	579 if (incremental) {

	580 // In the incremental case, only allow subsequent

	581 // transliterators to modify what has already been

	582 // completely processed by prior transliterators. In the

	583 // non-incrmental case, allow each transliterator to

	584 // process the entire text.

	585 index.limit = index.start;

	586 }

	587 }

	588

	589 compoundLimit += delta;

	590

	591 // Start is good where it is -- where the last transliterator left

	592 // it. Limit needs to be put back where it was, modulo

	593 // adjustments for deletions/insertions.

	594 index.limit = compoundLimit;

	595 }

	596

	597 /**

	598 * Sets the length of the longest context required by this transliterator.

	599 * This is <em>preceding</em> context.

	600 */

	601 void CompoundTransliterator::computeMaximumContextLength(void) {

	602 int32_t max = 0;

	603 for (int32_t i=0; i<count; ++i) {

	604 int32_t len = trans[i]->getMaximumContextLength();

	605 if (len > max) {

	606 max = len;

	607 }

	608 }

	609 setMaximumContextLength(max);

	610 }

	611

	612 U_NAMESPACE_END

	613

	614 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

	615

	616 /* eof */

OLD	NEW

« no previous file with comments | « icu46/source/i18n/cpdtrans.h ('k') | icu46/source/i18n/csdetect.h » ('j') | no next file with comments »