icu46/source/i18n/rbt_rule.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/rbt_rule.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 1999-2008, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 * Date Name Description

	7 * 11/17/99 aliu Creation.

	8 **********************************************************************

	9 */

	10

	11 #include "unicode/utypes.h"

	12

	13 #if !UCONFIG_NO_TRANSLITERATION

	14

	15 #include "unicode/rep.h"

	16 #include "unicode/unifilt.h"

	17 #include "unicode/uniset.h"

	18 #include "rbt_rule.h"

	19 #include "rbt_data.h"

	20 #include "cmemory.h"

	21 #include "strmatch.h"

	22 #include "strrepl.h"

	23 #include "util.h"

	24 #include "putilimp.h"

	25

	26 static const UChar FORWARD_OP[] = {32,62,32,0}; // " > "

	27

	28 U_NAMESPACE_BEGIN

	29

	30 /**

	31 * Construct a new rule with the given input, output text, and other

	32 * attributes. A cursor position may be specified for the output text.

	33 * @param input input string, including key and optional ante and

	34 * post context

	35 * @param anteContextPos offset into input to end of ante context, or -1 if

	36 * none. Must be <= input.length() if not -1.

	37 * @param postContextPos offset into input to start of post context, or -1

	38 * if none. Must be <= input.length() if not -1, and must be >=

	39 * anteContextPos.

	40 * @param output output string

	41 * @param cursorPosition offset into output at which cursor is located, or -1 if

	42 * none. If less than zero, then the cursor is placed after the

	43 * <code>output</code>; that is, -1 is equivalent to

	44 * <code>output.length()</code>. If greater than

	45 * <code>output.length()</code> then an exception is thrown.

	46 * @param segs array of UnicodeFunctors corresponding to input pattern

	47 * segments, or null if there are none. The array itself is adopted,

	48 * but the pointers within it are not.

	49 * @param segsCount number of elements in segs[]

	50 * @param anchorStart TRUE if the the rule is anchored on the left to

	51 * the context start

	52 * @param anchorEnd TRUE if the rule is anchored on the right to the

	53 * context limit

	54 */

	55 TransliterationRule::TransliterationRule(const UnicodeString& input,

	56 int32_t anteContextPos, int32_t postCon textPos,

	57 const UnicodeString& outputStr,

	58 int32_t cursorPosition, int32_t cursorO ffset,

	59 UnicodeFunctor** segs,

	60 int32_t segsCount,

	61 UBool anchorStart, UBool anchorEnd,

	62 const TransliterationRuleData* theData,

	63 UErrorCode& status) :

	64 UMemory(),

	65 segments(0),

	66 data(theData) {

	67

	68 if (U_FAILURE(status)) {

	69 return;

	70 }

	71 // Do range checks only when warranted to save time

	72 if (anteContextPos < 0) {

	73 anteContextLength = 0;

	74 } else {

	75 if (anteContextPos > input.length()) {

	76 // throw new IllegalArgumentException("Invalid ante context");

	77 status = U_ILLEGAL_ARGUMENT_ERROR;

	78 return;

	79 }

	80 anteContextLength = anteContextPos;

	81 }

	82 if (postContextPos < 0) {

	83 keyLength = input.length() - anteContextLength;

	84 } else {

	85 if (postContextPos < anteContextLength \|\|

	86 postContextPos > input.length()) {

	87 // throw new IllegalArgumentException("Invalid post context");

	88 status = U_ILLEGAL_ARGUMENT_ERROR;

	89 return;

	90 }

	91 keyLength = postContextPos - anteContextLength;

	92 }

	93 if (cursorPosition < 0) {

	94 cursorPosition = outputStr.length();

	95 } else if (cursorPosition > outputStr.length()) {

	96 // throw new IllegalArgumentException("Invalid cursor position");

	97 status = U_ILLEGAL_ARGUMENT_ERROR;

	98 return;

	99 }

	100 // We don't validate the segments array. The caller must

	101 // guarantee that the segments are well-formed (that is, that

	102 // all $n references in the output refer to indices of this

	103 // array, and that no array elements are null).

	104 this->segments = segs;

	105 this->segmentsCount = segsCount;

	106

	107 pattern = input;

	108 flags = 0;

	109 if (anchorStart) {

	110 flags \|= ANCHOR_START;

	111 }

	112 if (anchorEnd) {

	113 flags \|= ANCHOR_END;

	114 }

	115

	116 anteContext = NULL;

	117 if (anteContextLength > 0) {

	118 anteContext = new StringMatcher(pattern, 0, anteContextLength,

	119 FALSE, *data);

	120 /* test for NULL */

	121 if (anteContext == 0) {

	122 status = U_MEMORY_ALLOCATION_ERROR;

	123 return;

	124 }

	125 }

	126

	127 key = NULL;

	128 if (keyLength > 0) {

	129 key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength,

	130 FALSE, *data);

	131 /* test for NULL */

	132 if (key == 0) {

	133 status = U_MEMORY_ALLOCATION_ERROR;

	134 return;

	135 }

	136 }

	137

	138 int32_t postContextLength = pattern.length() - keyLength - anteContextLength ;

	139 postContext = NULL;

	140 if (postContextLength > 0) {

	141 postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(),

	142 FALSE, *data);

	143 /* test for NULL */

	144 if (postContext == 0) {

	145 status = U_MEMORY_ALLOCATION_ERROR;

	146 return;

	147 }

	148 }

	149

	150 this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data);

	151 /* test for NULL */

	152 if (this->output == 0) {

	153 status = U_MEMORY_ALLOCATION_ERROR;

	154 return;

	155 }

	156 }

	157

	158 /**

	159 * Copy constructor.

	160 */

	161 TransliterationRule::TransliterationRule(TransliterationRule& other) :

	162 UMemory(other),

	163 anteContext(NULL),

	164 key(NULL),

	165 postContext(NULL),

	166 pattern(other.pattern),

	167 anteContextLength(other.anteContextLength),

	168 keyLength(other.keyLength),

	169 flags(other.flags),

	170 data(other.data) {

	171

	172 segments = NULL;

	173 segmentsCount = 0;

	174 if (other.segmentsCount > 0) {

	175 segments = (UnicodeFunctor *)uprv_malloc(other.segmentsCount sizeof(U nicodeFunctor *));

	176 uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segment s[0]));

	177 }

	178

	179 if (other.anteContext != NULL) {

	180 anteContext = (StringMatcher*) other.anteContext->clone();

	181 }

	182 if (other.key != NULL) {

	183 key = (StringMatcher*) other.key->clone();

	184 }

	185 if (other.postContext != NULL) {

	186 postContext = (StringMatcher*) other.postContext->clone();

	187 }

	188 output = other.output->clone();

	189 }

	190

	191 TransliterationRule::~TransliterationRule() {

	192 uprv_free(segments);

	193 delete anteContext;

	194 delete key;

	195 delete postContext;

	196 delete output;

	197 }

	198

	199 /**

	200 * Return the preceding context length. This method is needed to

	201 * support the <code>Transliterator</code> method

	202 * <code>getMaximumContextLength()</code>. Internally, this is

	203 * implemented as the anteContextLength, optionally plus one if

	204 * there is a start anchor. The one character anchor gap is

	205 * needed to make repeated incremental transliteration with

	206 * anchors work.

	207 */

	208 int32_t TransliterationRule::getContextLength(void) const {

	209 return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0);

	210 }

	211

	212 /**

	213 * Internal method. Returns 8-bit index value for this rule.

	214 * This is the low byte of the first character of the key,

	215 * unless the first character of the key is a set. If it's a

	216 * set, or otherwise can match multiple keys, the index value is -1.

	217 */

	218 int16_t TransliterationRule::getIndexValue() const {

	219 if (anteContextLength == pattern.length()) {

	220 // A pattern with just ante context {such as foo)>bar} can

	221 // match any key.

	222 return -1;

	223 }

	224 UChar32 c = pattern.char32At(anteContextLength);

	225 return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1);

	226 }

	227

	228 /**

	229 * Internal method. Returns true if this rule matches the given

	230 * index value. The index value is an 8-bit integer, 0..255,

	231 * representing the low byte of the first character of the key.

	232 * It matches this rule if it matches the first character of the

	233 * key, or if the first character of the key is a set, and the set

	234 * contains any character with a low byte equal to the index

	235 * value. If the rule contains only ante context, as in foo)>bar,

	236 * then it will match any key.

	237 */

	238 UBool TransliterationRule::matchesIndexValue(uint8_t v) const {

	239 // Delegate to the key, or if there is none, to the postContext.

	240 // If there is neither then we match any key; return true.

	241 UnicodeMatcher *m = (key != NULL) ? key : postContext;

	242 return (m != NULL) ? m->matchesIndexValue(v) : TRUE;

	243 }

	244

	245 /**

	246 * Return true if this rule masks another rule. If r1 masks r2 then

	247 * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks

	248 * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".

	249 * "[c]a>x" masks "[dc]a>y".

	250 */

	251 UBool TransliterationRule::masks(const TransliterationRule& r2) const {

	252 /* Rule r1 masks rule r2 if the string formed of the

	253 * antecontext, key, and postcontext overlaps in the following

	254 * way:

	255 *

	256 * r1: aakkkpppp

	257 * r2: aaakkkkkpppp

	258 * ^

	259 *

	260 * The strings must be aligned at the first character of the

	261 * key. The length of r1 to the left of the alignment point

	262 * must be <= the length of r2 to the left; ditto for the

	263 * right. The characters of r1 must equal (or be a superset

	264 * of) the corresponding characters of r2. The superset

	265 * operation should be performed to check for UnicodeSet

	266 * masking.

	267 *

	268 * Anchors: Two patterns that differ only in anchors only

	269 * mask one another if they are exactly equal, and r2 has

	270 * all the anchors r1 has (optionally, plus some). Here Y

	271 * means the row masks the column, N means it doesn't.

	272 *

	273 * ab ^ab ab$ ^ab$

	274 * ab Y Y Y Y

	275 * ^ab N Y N Y

	276 * ab$ N N Y Y

	277 * ^ab$ N N N Y

	278 *

	279 * Post context: {a}b masks ab, but not vice versa, since {a}b

	280 * matches everything ab matches, and {a}b matches {\|a\|}b but ab

	281 * does not. Pre context is different (a{b} does not align with

	282 * ab).

	283 */

	284

	285 /* LIMITATION of the current mask algorithm: Some rule

	286 * maskings are currently not detected. For example,

	287 * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO

	288 */

	289

	290 int32_t len = pattern.length();

	291 int32_t left = anteContextLength;

	292 int32_t left2 = r2.anteContextLength;

	293 int32_t right = len - left;

	294 int32_t right2 = r2.pattern.length() - left2;

	295 int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern);

	296

	297 // TODO Clean this up -- some logic might be combinable with the

	298 // next statement.

	299

	300 // Test for anchor masking

	301 if (left == left2 && right == right2 &&

	302 keyLength <= r2.keyLength &&

	303 0 == cachedCompare) {

	304 // The following boolean logic implements the table above

	305 return (flags == r2.flags) \|\|

	306 (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) \|\|

	307 ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END));

	308 }

	309

	310 return left <= left2 &&

	311 (right < right2 \|\|

	312 (right == right2 && keyLength <= r2.keyLength)) &&

	313 (0 == cachedCompare);

	314 }

	315

	316 static inline int32_t posBefore(const Replaceable& str, int32_t pos) {

	317 return (pos > 0) ?

	318 pos - UTF_CHAR_LENGTH(str.char32At(pos-1)) :

	319 pos - 1;

	320 }

	321

	322 static inline int32_t posAfter(const Replaceable& str, int32_t pos) {

	323 return (pos >= 0 && pos < str.length()) ?

	324 pos + UTF_CHAR_LENGTH(str.char32At(pos)) :

	325 pos + 1;

	326 }

	327

	328 /**

	329 * Attempt a match and replacement at the given position. Return

	330 * the degree of match between this rule and the given text. The

	331 * degree of match may be mismatch, a partial match, or a full

	332 * match. A mismatch means at least one character of the text

	333 * does not match the context or key. A partial match means some

	334 * context and key characters match, but the text is not long

	335 * enough to match all of them. A full match means all context

	336 * and key characters match.

	337 *

	338 * If a full match is obtained, perform a replacement, update pos,

	339 * and return U_MATCH. Otherwise both text and pos are unchanged.

	340 *

	341 * @param text the text

	342 * @param pos the position indices

	343 * @param incremental if TRUE, test for partial matches that may

	344 * be completed by additional text inserted at pos.limit.

	345 * @return one of <code>U_MISMATCH</code>,

	346 * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If

	347 * incremental is FALSE then U_PARTIAL_MATCH will not be returned.

	348 */

	349 UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,

	350 UTransPosition& pos,

	351 UBool incremental) const {

	352 // Matching and replacing are done in one method because the

	353 // replacement operation needs information obtained during the

	354 // match. Another way to do this is to have the match method

	355 // create a match result struct with relevant offsets, and to pass

	356 // this into the replace method.

	357

	358 // ============================ MATCH ===========================

	359

	360 // Reset segment match data

	361 if (segments != NULL) {

	362 for (int32_t i=0; i<segmentsCount; ++i) {

	363 ((StringMatcher*) segments[i])->resetMatch();

	364 }

	365 }

	366

	367 // int32_t lenDelta, keyLimit;

	368 int32_t keyLimit;

	369

	370 // ------------------------ Ante Context ------------------------

	371

	372 // A mismatch in the ante context, or with the start anchor,

	373 // is an outright U_MISMATCH regardless of whether we are

	374 // incremental or not.

	375 int32_t oText; // offset into 'text'

	376 // int32_t newStart = 0;

	377 int32_t minOText;

	378

	379 // Note (1): We process text in 16-bit code units, rather than

	380 // 32-bit code points. This works because stand-ins are

	381 // always in the BMP and because we are doing a literal match

	382 // operation, which can be done 16-bits at a time.

	383

	384 int32_t anteLimit = posBefore(text, pos.contextStart);

	385

	386 UMatchDegree match;

	387

	388 // Start reverse match at char before pos.start

	389 oText = posBefore(text, pos.start);

	390

	391 if (anteContext != NULL) {

	392 match = anteContext->matches(text, oText, anteLimit, FALSE);

	393 if (match != U_MATCH) {

	394 return U_MISMATCH;

	395 }

	396 }

	397

	398 minOText = posAfter(text, oText);

	399

	400 // ------------------------ Start Anchor ------------------------

	401

	402 if (((flags & ANCHOR_START) != 0) && oText != anteLimit) {

	403 return U_MISMATCH;

	404 }

	405

	406 // -------------------- Key and Post Context --------------------

	407

	408 oText = pos.start;

	409

	410 if (key != NULL) {

	411 match = key->matches(text, oText, pos.limit, incremental);

	412 if (match != U_MATCH) {

	413 return match;

	414 }

	415 }

	416

	417 keyLimit = oText;

	418

	419 if (postContext != NULL) {

	420 if (incremental && keyLimit == pos.limit) {

	421 // The key matches just before pos.limit, and there is

	422 // a postContext. Since we are in incremental mode,

	423 // we must assume more characters may be inserted at

	424 // pos.limit -- this is a partial match.

	425 return U_PARTIAL_MATCH;

	426 }

	427

	428 match = postContext->matches(text, oText, pos.contextLimit, incremental) ;

	429 if (match != U_MATCH) {

	430 return match;

	431 }

	432 }

	433

	434 // ------------------------- Stop Anchor ------------------------

	435

	436 if (((flags & ANCHOR_END)) != 0) {

	437 if (oText != pos.contextLimit) {

	438 return U_MISMATCH;

	439 }

	440 if (incremental) {

	441 return U_PARTIAL_MATCH;

	442 }

	443 }

	444

	445 // =========================== REPLACE ==========================

	446

	447 // We have a full match. The key is between pos.start and

	448 // keyLimit.

	449

	450 int32_t newStart;

	451 int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart);

	452 int32_t lenDelta = newLength - (keyLimit - pos.start);

	453

	454 oText += lenDelta;

	455 pos.limit += lenDelta;

	456 pos.contextLimit += lenDelta;

	457 // Restrict new value of start to [minOText, min(oText, pos.limit)].

	458 pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart ));

	459 return U_MATCH;

	460 }

	461

	462 /**

	463 * Create a source string that represents this rule. Append it to the

	464 * given string.

	465 */

	466 UnicodeString& TransliterationRule::toRule(UnicodeString& rule,

	467 UBool escapeUnprintable) const {

	468

	469 // Accumulate special characters (and non-specials following them)

	470 // into quoteBuf. Append quoteBuf, within single quotes, when

	471 // a non-quoted element must be inserted.

	472 UnicodeString str, quoteBuf;

	473

	474 // Do not emit the braces '{' '}' around the pattern if there

	475 // is neither anteContext nor postContext.

	476 UBool emitBraces =

	477 (anteContext != NULL) \|\| (postContext != NULL);

	478

	479 // Emit start anchor

	480 if ((flags & ANCHOR_START) != 0) {

	481 rule.append((UChar)94/^/);

	482 }

	483

	484 // Emit the input pattern

	485 ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf);

	486

	487 if (emitBraces) {

	488 ICU_Utility::appendToRule(rule, (UChar) 0x007B /{/, TRUE, escapeUnprin table, quoteBuf);

	489 }

	490

	491 ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf);

	492

	493 if (emitBraces) {

	494 ICU_Utility::appendToRule(rule, (UChar) 0x007D /}/, TRUE, escapeUnprin table, quoteBuf);

	495 }

	496

	497 ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf);

	498

	499 // Emit end anchor

	500 if ((flags & ANCHOR_END) != 0) {

	501 rule.append((UChar)36/$/);

	502 }

	503

	504 ICU_Utility::appendToRule(rule, FORWARD_OP, TRUE, escapeUnprintable, quoteBu f);

	505

	506 // Emit the output pattern

	507

	508 ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable),

	509 TRUE, escapeUnprintable, quoteBuf);

	510

	511 ICU_Utility::appendToRule(rule, (UChar) 0x003B /;/, TRUE, escapeUnprintabl e, quoteBuf);

	512

	513 return rule;

	514 }

	515

	516 void TransliterationRule::setData(const TransliterationRuleData* d) {

	517 data = d;

	518 if (anteContext != NULL) anteContext->setData(d);

	519 if (postContext != NULL) postContext->setData(d);

	520 if (key != NULL) key->setData(d);

	521 // assert(output != NULL);

	522 output->setData(d);

	523 // Don't have to do segments since they are in the context or key

	524 }

	525

	526 /**

	527 * Union the set of all characters that may be modified by this rule

	528 * into the given set.

	529 */

	530 void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {

	531 int32_t limit = anteContextLength + keyLength;

	532 for (int32_t i=anteContextLength; i<limit; ) {

	533 UChar32 ch = pattern.char32At(i);

	534 i += UTF_CHAR_LENGTH(ch);

	535 const UnicodeMatcher* matcher = data->lookupMatcher(ch);

	536 if (matcher == NULL) {

	537 toUnionTo.add(ch);

	538 } else {

	539 matcher->addMatchSetTo(toUnionTo);

	540 }

	541 }

	542 }

	543

	544 /**

	545 * Union the set of all characters that may be emitted by this rule

	546 * into the given set.

	547 */

	548 void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const {

	549 output->toReplacer()->addReplacementSetTo(toUnionTo);

	550 }

	551

	552 U_NAMESPACE_END

	553

	554 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

	555

	556 //eof

OLD	NEW

« no previous file with comments | « icu46/source/i18n/rbt_rule.h ('k') | icu46/source/i18n/rbt_set.h » ('j') | no next file with comments »