icu46/source/i18n/rbt_set.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/rbt_set.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 1999-2008, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 * Date Name Description

	7 * 11/17/99 aliu Creation.

	8 **********************************************************************

	9 */

	10

	11 #include "unicode/utypes.h"

	12

	13 #if !UCONFIG_NO_TRANSLITERATION

	14

	15 #include "unicode/unistr.h"

	16 #include "unicode/uniset.h"

	17 #include "rbt_set.h"

	18 #include "rbt_rule.h"

	19 #include "cmemory.h"

	20 #include "putilimp.h"

	21

	22 U_CDECL_BEGIN

	23 static void U_CALLCONV _deleteRule(void *rule) {

	24 delete (U_NAMESPACE_QUALIFIER TransliterationRule *)rule;

	25 }

	26 U_CDECL_END

	27

	28 //----------------------------------------------------------------------

	29 // BEGIN Debugging support

	30 //----------------------------------------------------------------------

	31

	32 // #define DEBUG_RBT

	33

	34 #ifdef DEBUG_RBT

	35 #include <stdio.h>

	36 #include "charstr.h"

	37

	38 /**

	39 * @param appendTo result is appended to this param.

	40 * @param input the string being transliterated

	41 * @param pos the index struct

	42 */

	43 static UnicodeString& _formatInput(UnicodeString &appendTo,

	44 const UnicodeString& input,

	45 const UTransPosition& pos) {

	46 // Output a string of the form aaa{bbb\|ccc\|ddd}eee, where

	47 // the {} indicate the context start and limit, and the \|\|

	48 // indicate the start and limit.

	49 if (0 <= pos.contextStart &&

	50 pos.contextStart <= pos.start &&

	51 pos.start <= pos.limit &&

	52 pos.limit <= pos.contextLimit &&

	53 pos.contextLimit <= input.length()) {

	54

	55 UnicodeString a, b, c, d, e;

	56 input.extractBetween(0, pos.contextStart, a);

	57 input.extractBetween(pos.contextStart, pos.start, b);

	58 input.extractBetween(pos.start, pos.limit, c);

	59 input.extractBetween(pos.limit, pos.contextLimit, d);

	60 input.extractBetween(pos.contextLimit, input.length(), e);

	61 appendTo.append(a).append((UChar)123/{/).append(b).

	62 append((UChar)124/\|/).append(c).append((UChar)124/\|/).append(d).

	63 append((UChar)125/}/).append(e);

	64 } else {

	65 appendTo.append("INVALID UTransPosition");

	66 //appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +

	67 // pos.contextStart + ", s=" + pos.start + ", l=" +

	68 // pos.limit + ", cl=" + pos.contextLimit + "} on " +

	69 // input);

	70 }

	71 return appendTo;

	72 }

	73

	74 // Append a hex string to the target

	75 UnicodeString& _appendHex(uint32_t number,

	76 int32_t digits,

	77 UnicodeString& target) {

	78 static const UChar digitString[] = {

	79 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,

	80 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0

	81 };

	82 while (digits--) {

	83 target += digitString[(number >> (digits*4)) & 0xF];

	84 }

	85 return target;

	86 }

	87

	88 // Replace nonprintable characters with unicode escapes

	89 UnicodeString& _escape(const UnicodeString &source,

	90 UnicodeString &target) {

	91 for (int32_t i = 0; i < source.length(); ) {

	92 UChar32 ch = source.char32At(i);

	93 i += UTF_CHAR_LENGTH(ch);

	94 if (ch < 0x09 \|\| (ch > 0x0A && ch < 0x20)\|\| ch > 0x7E) {

	95 if (ch <= 0xFFFF) {

	96 target += "\\u";

	97 _appendHex(ch, 4, target);

	98 } else {

	99 target += "\\U";

	100 _appendHex(ch, 8, target);

	101 }

	102 } else {

	103 target += ch;

	104 }

	105 }

	106 return target;

	107 }

	108

	109 inline void _debugOut(const char* msg, TransliterationRule* rule,

	110 const Replaceable& theText, UTransPosition& pos) {

	111 UnicodeString buf(msg, "");

	112 if (rule) {

	113 UnicodeString r;

	114 rule->toRule(r, TRUE);

	115 buf.append((UChar)32).append(r);

	116 }

	117 buf.append(UnicodeString(" => ", ""));

	118 UnicodeString* text = (UnicodeString*)&theText;

	119 _formatInput(buf, *text, pos);

	120 UnicodeString esc;

	121 _escape(buf, esc);

	122 CharString cbuf(esc);

	123 printf("%s\n", (const char*) cbuf);

	124 }

	125

	126 #else

	127 #define _debugOut(msg, rule, theText, pos)

	128 #endif

	129

	130 //----------------------------------------------------------------------

	131 // END Debugging support

	132 //----------------------------------------------------------------------

	133

	134 // Fill the precontext and postcontext with the patterns of the rules

	135 // that are masking one another.

	136 static void maskingError(const U_NAMESPACE_QUALIFIER TransliterationRule& rule1,

	137 const U_NAMESPACE_QUALIFIER TransliterationRule& rule2,

	138 UParseError& parseError) {

	139 U_NAMESPACE_QUALIFIER UnicodeString r;

	140 int32_t len;

	141

	142 parseError.line = parseError.offset = -1;

	143

	144 // for pre-context

	145 rule1.toRule(r, FALSE);

	146 len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);

	147 r.extract(0, len, parseError.preContext);

	148 parseError.preContext[len] = 0;

	149

	150 //for post-context

	151 r.truncate(0);

	152 rule2.toRule(r, FALSE);

	153 len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);

	154 r.extract(0, len, parseError.postContext);

	155 parseError.postContext[len] = 0;

	156 }

	157

	158 U_NAMESPACE_BEGIN

	159

	160 /**

	161 * Construct a new empty rule set.

	162 */

	163 TransliterationRuleSet::TransliterationRuleSet(UErrorCode& status) : UMemory() {

	164 ruleVector = new UVector(&_deleteRule, NULL, status);

	165 if (U_FAILURE(status)) {

	166 return;

	167 }

	168 if (ruleVector == NULL) {

	169 status = U_MEMORY_ALLOCATION_ERROR;

	170 }

	171 rules = NULL;

	172 maxContextLength = 0;

	173 }

	174

	175 /**

	176 * Copy constructor.

	177 */

	178 TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet& oth er) :

	179 UMemory(other),

	180 ruleVector(0),

	181 rules(0),

	182 maxContextLength(other.maxContextLength) {

	183

	184 int32_t i, len;

	185 uprv_memcpy(index, other.index, sizeof(index));

	186 UErrorCode status = U_ZERO_ERROR;

	187 ruleVector = new UVector(&_deleteRule, NULL, status);

	188 if (other.ruleVector != 0 && ruleVector != 0 && U_SUCCESS(status)) {

	189 len = other.ruleVector->size();

	190 for (i=0; i<len && U_SUCCESS(status); ++i) {

	191 TransliterationRule tempTranslitRule = new TransliterationRule((Tr ansliterationRule*)other.ruleVector->elementAt(i));

	192 // Null pointer test

	193 if (tempTranslitRule == NULL) {

	194 status = U_MEMORY_ALLOCATION_ERROR;

	195 break;

	196 }

	197 ruleVector->addElement(tempTranslitRule, status);

	198 if (U_FAILURE(status)) {

	199 break;

	200 }

	201 }

	202 }

	203 if (other.rules != 0 && U_SUCCESS(status)) {

	204 UParseError p;

	205 freeze(p, status);

	206 }

	207 }

	208

	209 /**

	210 * Destructor.

	211 */

	212 TransliterationRuleSet::~TransliterationRuleSet() {

	213 delete ruleVector; // This deletes the contained rules

	214 uprv_free(rules);

	215 }

	216

	217 void TransliterationRuleSet::setData(const TransliterationRuleData* d) {

	218 /**

	219 * We assume that the ruleset has already been frozen.

	220 */

	221 int32_t len = index[256]; // see freeze()

	222 for (int32_t i=0; i<len; ++i) {

	223 rules[i]->setData(d);

	224 }

	225 }

	226

	227 /**

	228 * Return the maximum context length.

	229 * @return the length of the longest preceding context.

	230 */

	231 int32_t TransliterationRuleSet::getMaximumContextLength(void) const {

	232 return maxContextLength;

	233 }

	234

	235 /**

	236 * Add a rule to this set. Rules are added in order, and order is

	237 * significant. The last call to this method must be followed by

	238 * a call to <code>freeze()</code> before the rule set is used.

	239 *

	240 * <p>If freeze() has already been called, calling addRule()

	241 * unfreezes the rules, and freeze() must be called again.

	242 *

	243 * @param adoptedRule the rule to add

	244 */

	245 void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,

	246 UErrorCode& status) {

	247 if (U_FAILURE(status)) {

	248 delete adoptedRule;

	249 return;

	250 }

	251 ruleVector->addElement(adoptedRule, status);

	252

	253 int32_t len;

	254 if ((len = adoptedRule->getContextLength()) > maxContextLength) {

	255 maxContextLength = len;

	256 }

	257

	258 uprv_free(rules);

	259 rules = 0;

	260 }

	261

	262 /**

	263 * Check this for masked rules and index it to optimize performance.

	264 * The sequence of operations is: (1) add rules to a set using

	265 * <code>addRule()</code>; (2) freeze the set using

	266 * <code>freeze()</code>; (3) use the rule set. If

	267 * <code>addRule()</code> is called after calling this method, it

	268 * invalidates this object, and this method must be called again.

	269 * That is, <code>freeze()</code> may be called multiple times,

	270 * although for optimal performance it shouldn't be.

	271 */

	272 void TransliterationRuleSet::freeze(UParseError& parseError,UErrorCode& status) {

	273 /* Construct the rule array and index table. We reorder the

	274 * rules by sorting them into 256 bins. Each bin contains all

	275 * rules matching the index value for that bin. A rule

	276 * matches an index value if string whose first key character

	277 * has a low byte equal to the index value can match the rule.

	278 *

	279 * Each bin contains zero or more rules, in the same order

	280 * they were found originally. However, the total rules in

	281 * the bins may exceed the number in the original vector,

	282 * since rules that have a variable as their first key

	283 * character will generally fall into more than one bin.

	284 *

	285 * That is, each bin contains all rules that either have that

	286 * first index value as their first key character, or have

	287 * a set containing the index value as their first character.

	288 */

	289 int32_t n = ruleVector->size();

	290 int32_t j;

	291 int16_t x;

	292 UVector v(2*n, status); // heuristic; adjust as needed

	293

	294 if (U_FAILURE(status)) {

	295 return;

	296 }

	297

	298 /* Precompute the index values. This saves a LOT of time.

	299 * Be careful not to call malloc(0).

	300 */

	301 int16_t* indexValue = (int16_t) uprv_malloc( sizeof(int16_t) (n > 0 ? n : 1) );

	302 /* test for NULL */

	303 if (indexValue == 0) {

	304 status = U_MEMORY_ALLOCATION_ERROR;

	305 return;

	306 }

	307 for (j=0; j<n; ++j) {

	308 TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j) ;

	309 indexValue[j] = r->getIndexValue();

	310 }

	311 for (x=0; x<256; ++x) {

	312 index[x] = v.size();

	313 for (j=0; j<n; ++j) {

	314 if (indexValue[j] >= 0) {

	315 if (indexValue[j] == x) {

	316 v.addElement(ruleVector->elementAt(j), status);

	317 }

	318 } else {

	319 // If the indexValue is < 0, then the first key character is

	320 // a set, and we must use the more time-consuming

	321 // matchesIndexValue check. In practice this happens

	322 // rarely, so we seldom tread this code path.

	323 TransliterationRule* r = (TransliterationRule*) ruleVector->elem entAt(j);

	324 if (r->matchesIndexValue((uint8_t)x)) {

	325 v.addElement(r, status);

	326 }

	327 }

	328 }

	329 }

	330 uprv_free(indexValue);

	331 index[256] = v.size();

	332

	333 /* Freeze things into an array.

	334 */

	335 uprv_free(rules); // Contains alias pointers

	336

	337 /* You can't do malloc(0)! */

	338 if (v.size() == 0) {

	339 rules = NULL;

	340 return;

	341 }

	342 rules = (TransliterationRule *)uprv_malloc(v.size() sizeof(Transliteratio nRule *));

	343 /* test for NULL */

	344 if (rules == 0) {

	345 status = U_MEMORY_ALLOCATION_ERROR;

	346 return;

	347 }

	348 for (j=0; j<v.size(); ++j) {

	349 rules[j] = (TransliterationRule*) v.elementAt(j);

	350 }

	351

	352 // TODO Add error reporting that indicates the rules that

	353 // are being masked.

	354 //UnicodeString errors;

	355

	356 /* Check for masking. This is MUCH faster than our old check,

	357 * which was each rule against each following rule, since we

	358 * only have to check for masking within each bin now. It's

	359 * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule

	360 * count, and n2 is the per-bin rule count. But n2<<n1, so

	361 * it's a big win.

	362 */

	363 for (x=0; x<256; ++x) {

	364 for (j=index[x]; j<index[x+1]-1; ++j) {

	365 TransliterationRule* r1 = rules[j];

	366 for (int32_t k=j+1; k<index[x+1]; ++k) {

	367 TransliterationRule* r2 = rules[k];

	368 if (r1->masks(*r2)) {

	369 //\| if (errors == null) {

	370 //\| errors = new StringBuffer();

	371 //\| } else {

	372 //\| errors.append("\n");

	373 //\| }

	374 //\| errors.append("Rule " + r1 + " masks " + r2);

	375 status = U_RULE_MASK_ERROR;

	376 maskingError(r1, r2, parseError);

	377 return;

	378 }

	379 }

	380 }

	381 }

	382

	383 //if (errors != null) {

	384 // throw new IllegalArgumentException(errors.toString());

	385 //}

	386 }

	387

	388 /**

	389 * Transliterate the given text with the given UTransPosition

	390 * indices. Return TRUE if the transliteration should continue

	391 * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).

	392 * Note that FALSE is only ever returned if isIncremental is TRUE.

	393 * @param text the text to be transliterated

	394 * @param pos the position indices, which will be updated

	395 * @param incremental if TRUE, assume new text may be inserted

	396 * at index.limit, and return FALSE if thre is a partial match.

	397 * @return TRUE unless a U_PARTIAL_MATCH has been obtained,

	398 * indicating that transliteration should stop until more text

	399 * arrives.

	400 */

	401 UBool TransliterationRuleSet::transliterate(Replaceable& text,

	402 UTransPosition& pos,

	403 UBool incremental) {

	404 int16_t indexByte = (int16_t) (text.char32At(pos.start) & 0xFF);

	405 for (int32_t i=index[indexByte]; i<index[indexByte+1]; ++i) {

	406 UMatchDegree m = rules[i]->matchAndReplace(text, pos, incremental);

	407 switch (m) {

	408 case U_MATCH:

	409 _debugOut("match", rules[i], text, pos);

	410 return TRUE;

	411 case U_PARTIAL_MATCH:

	412 _debugOut("partial match", rules[i], text, pos);

	413 return FALSE;

	414 default: /* Ram: added default to make GCC happy */

	415 break;

	416 }

	417 }

	418 // No match or partial match from any rule

	419 pos.start += UTF_CHAR_LENGTH(text.char32At(pos.start));

	420 _debugOut("no match", NULL, text, pos);

	421 return TRUE;

	422 }

	423

	424 /**

	425 * Create rule strings that represents this rule set.

	426 */

	427 UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,

	428 UBool escapeUnprintable) const {

	429 int32_t i;

	430 int32_t count = ruleVector->size();

	431 ruleSource.truncate(0);

	432 for (i=0; i<count; ++i) {

	433 if (i != 0) {

	434 ruleSource.append((UChar) 0x000A /\n/);

	435 }

	436 TransliterationRule *r =

	437 (TransliterationRule*) ruleVector->elementAt(i);

	438 r->toRule(ruleSource, escapeUnprintable);

	439 }

	440 return ruleSource;

	441 }

	442

	443 /**

	444 * Return the set of all characters that may be modified

	445 * (getTarget=false) or emitted (getTarget=true) by this set.

	446 */

	447 UnicodeSet& TransliterationRuleSet::getSourceTargetSet(UnicodeSet& result,

	448 UBool getTarget) const

	449 {

	450 result.clear();

	451 int32_t count = ruleVector->size();

	452 for (int32_t i=0; i<count; ++i) {

	453 TransliterationRule* r =

	454 (TransliterationRule*) ruleVector->elementAt(i);

	455 if (getTarget) {

	456 r->addTargetSetTo(result);

	457 } else {

	458 r->addSourceSetTo(result);

	459 }

	460 }

	461 return result;

	462 }

	463

	464 U_NAMESPACE_END

	465

	466 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

OLD	NEW

« no previous file with comments | « icu46/source/i18n/rbt_set.h ('k') | icu46/source/i18n/rbtz.cpp » ('j') | no next file with comments »